Thursday, 6 September 2012

Using the word breaker from Microsoft SQL Server 2005 in a stand-alone C# program


I have been writing a program which will query a full text search index on Microsoft SQL Server 2005. The data being indexed are large blobs of text (multiple lines).
I need to extract only those lines of text which contain the search words for display with the results.
Full Text Search uses a word breaker (IWordBreaker COM interface) and stemmer (IStemmer COM interface) to break the indexed text into words, and to enable matching of alternate word forms (e.g. "welcome" will also match "welcomes", "welcomed" and "welcoming").
I therefore need to use the SQL Server word breaker and stemmer to determine whether each line of text from the blob matches the search string, as I believe the standard word breaker and stemmer in Windows Search use a different algorithm.
I thought if I copied the two dlls (infosoft and langwrbk) from a SQL Server installation to the machine on which I am running my program, and ran regsvr32 on them, they would be installed in the registry, and I would be able to use them. Unfortunately, this does not work, as regsvr32 does not add any information to the registry.
I therefore needed to load the COM components in the dlls "by hand". I started with John Jeffery's code to load COM components by hand. Once I had got over some 32-bit/64-bit problems (your DLL has to match the bit size of your calling code), it worked.
Here is the LinqPad script I used to test the two interfaces, in case someone else needs to do the same thing:

// Folder the SQL Server full-text DLLs are loaded from. The path is hard-coded;
// adjust it to match the local SQL Server installation.
const string SqlServerDllFolder = @"C:\Program Files\Microsoft SQL Server\MSSQL.1\MSSQL\Binn";
// Native modules loaded by hand through LibraryModule.LoadModule (the post explains
// that regsvr32 does not register these DLLs, so normal COM activation is unavailable).
static LibraryModule langwrbk = LibraryModule.LoadModule(Path.Combine(SqlServerDllFolder, "LangWrbk.dll"));
static LibraryModule infosoft = LibraryModule.LoadModule(Path.Combine(SqlServerDllFolder, "infosoft.dll"));
// CLSID handed to ComHelper.CreateInstance together with infosoft.dll to obtain an IStemmer.
static Guid stemmer = new Guid("D99F7670-7F1A-11CE-BE57-00AA0051FE20");
// CLSID handed to ComHelper.CreateInstance together with LangWrbk.dll to obtain an IWordBreaker.
static Guid breaker = new Guid("173C97E2-AEBE-437C-9445-01B237ABF2F6");

// LINQPad entry point: keeps prompting for a query string and prints every
// word/stem the word breaker produces for it. An empty (or whitespace-only)
// entry ends the loop.
void Main()
{
 var wordBreaker = new WordBreaker();
 for (;;) {
  string query = Util.ReadLine("Query string:").Trim();
  if (query == "") {
   break;
  }
  Console.WriteLine("Original text:" + query);
  foreach (string term in wordBreaker.Search(query)) {
   Console.WriteLine(term);
  }
 }
}

// COM Interface to Microsoft word breaker and stemmer
// Kind of break reported through IWordSink.PutBreak.
// The members are mutually exclusive, sequential values (0..3), not bit flags,
// so the enum must NOT carry [Flags]: with it, EOC (3) would read as EOS | EOP.
public enum WORDREP_BREAK_TYPE {
 WORDREP_BREAK_EOW = 0, // end of word
 WORDREP_BREAK_EOS = 1, // end of sentence
 WORDREP_BREAK_EOP = 2, // end of paragraph
 WORDREP_BREAK_EOC = 3  // end of chapter
}

// Managed declaration of the IWordSink COM callback interface.
// In this file WordBreaker implements it and passes itself to
// IWordBreaker.BreakText to receive the words found in the text.
// Do not reorder the methods: an IUnknown-based COM interface is dispatched
// by vtable slot, which follows declaration order.
[ComImport]
[Guid("CC907054-C058-101A-B554-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IWordSink {
 // Receives one word. cwc is the number of characters of pwcInBuf that make up
 // the word (WordBreaker takes Substring(0, cwc)).
 // cwcSrcLen / cwcSrcPos: presumably the length and position of the word in the
 // source text; unused in this file - confirm against the IWordSink documentation.
 void PutWord([MarshalAs(UnmanagedType.U4)] int cwc,
  [MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf,
  [MarshalAs(UnmanagedType.U4)] int cwcSrcLen,
  [MarshalAs(UnmanagedType.U4)] int cwcSrcPos);
 // Same parameter list as PutWord; WordBreaker's implementation ignores these calls.
 void PutAltWord([MarshalAs(UnmanagedType.U4)] int cwc,
  [MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf,
  [MarshalAs(UnmanagedType.U4)] int cwcSrcLen,
  [MarshalAs(UnmanagedType.U4)] int cwcSrcPos);
 // Alternative-phrase bracketing callbacks; both are no-ops in WordBreaker.
 void StartAltPhrase();
 void EndAltPhrase();
 // Reports a break of the given type; a no-op in WordBreaker.
 void PutBreak(WORDREP_BREAK_TYPE breakType);
}

// Managed declaration of the IPhraseSink COM callback interface.
// Nothing in this file implements it; WordBreaker.Search passes null for the
// phrase sink argument of IWordBreaker.BreakText.
// Do not reorder the methods: declaration order defines the COM vtable slots.
[ComImport]
[Guid("CC906FF0-C058-101A-B554-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPhraseSink {
 // Each string parameter is followed by its character count (cwc*).
 // ulAttachmentType: meaning not shown in this file - see the IPhraseSink documentation.
 void PutSmallPhrase([MarshalAs(UnmanagedType.LPWStr)] string pwcNoun,
  [MarshalAs(UnmanagedType.U4)] int cwcNoun,
  [MarshalAs(UnmanagedType.LPWStr)] string pwcModifier,
  [MarshalAs(UnmanagedType.U4)] int cwcModifier,
  [MarshalAs(UnmanagedType.U4)] int ulAttachmentType);
 // Receives a phrase buffer together with its character count.
 void PutPhrase([MarshalAs(UnmanagedType.LPWStr)] string pwcPhrase,
  [MarshalAs(UnmanagedType.U4)] int cwcPhrase);
}
// Managed declaration of the IWordFormSink COM callback interface.
// WordBreaker implements it and passes itself to IStemmer.GenerateWordForms;
// both callbacks add the first cwc characters of pwcInBuf to the result set.
// Note the parameter order (buffer first, then count) is the reverse of IWordSink's.
// Do not reorder the methods: declaration order defines the COM vtable slots.
[ComImport]
[Guid("fe77c330-7f42-11ce-be57-00aa0051fe20")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IWordFormSink {
 void PutAltWord([MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf, [MarshalAs(UnmanagedType.U4)] int cwc);
 void PutWord([MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf, [MarshalAs(UnmanagedType.U4)] int cwc);
}

// Text descriptor passed by reference to IWordBreaker.BreakText.
// Sequential layout: the field order below is the native layout and must not change.
[StructLayout(LayoutKind.Sequential)]
public struct TEXT_SOURCE {
 // Callback marshalled as a native function pointer; see delFillTextBuffer.
 [MarshalAs(UnmanagedType.FunctionPtr)]
 public delFillTextBuffer pfnFillTextBuffer;
 // The text to break (WordBreaker.Search stores the lower-cased input here).
 [MarshalAs(UnmanagedType.LPWStr)]
 public string awcBuffer;
 // WordBreaker.Search sets this to the length of the text.
 [MarshalAs(UnmanagedType.U4)]
 public int iEnd;
 // WordBreaker.Search sets this to 0, i.e. the start of the buffer.
 [MarshalAs(UnmanagedType.U4)]
 public int iCur;
}

// Signature of the callback stored in TEXT_SOURCE.pfnFillTextBuffer, used to
// fill the buffer for TEXT_SOURCE. The uint result is a status code: the only
// implementation in this file always returns 0x80041780 (WBREAK_E_END_OF_TEXT).
public delegate uint delFillTextBuffer([MarshalAs(UnmanagedType.Struct)]
 ref TEXT_SOURCE pTextSource);

// Managed declaration of the IWordBreaker COM interface.
// The methods MUST be declared in the same order as the native interface
// (Init, BreakText, ComposePhrase, GetLicenseToUse): an IUnknown-based COM
// interface is dispatched by vtable slot, and the slot follows declaration order.
// ComposePhrase is declared even though this script never calls it; without it
// GetLicenseToUse would be bound to ComposePhrase's slot.
[ComImport]
[Guid("D53552C8-77E3-101A-B552-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IWordBreaker {
 // Initialises the word breaker. fQuery selects query-time (true) versus
 // index-time (false) breaking; maxTokenSize is the longest token to emit;
 // pfLicense reports whether license restrictions apply.
 void Init([MarshalAs(UnmanagedType.Bool)] bool fQuery,
  [MarshalAs(UnmanagedType.U4)] int maxTokenSize,
  [MarshalAs(UnmanagedType.Bool)] out bool pfLicense);
 // Breaks the text described by pTextSource, reporting words to pWordSink and
 // phrases to pPhraseSink (which may be null, as in WordBreaker.Search).
 void BreakText([MarshalAs(UnmanagedType.Struct)] ref TEXT_SOURCE pTextSource,
  [MarshalAs(UnmanagedType.Interface)] IWordSink pWordSink,
  [MarshalAs(UnmanagedType.Interface)] IPhraseSink pPhraseSink);
 // Vtable placeholder for the native ComposePhrase method; not called by this script.
 // pwcPhrase is a caller-allocated output buffer and pcwcPhrase its size in/out.
 void ComposePhrase([MarshalAs(UnmanagedType.LPWStr)] string pwcNoun,
  [MarshalAs(UnmanagedType.U4)] int cwcNoun,
  [MarshalAs(UnmanagedType.LPWStr)] string pwcModifier,
  [MarshalAs(UnmanagedType.U4)] int cwcModifier,
  [MarshalAs(UnmanagedType.U4)] int ulAttachmentType,
  IntPtr pwcPhrase,
  ref int pcwcPhrase);
 // Returns the license text of the word breaker.
 void GetLicenseToUse([MarshalAs(UnmanagedType.LPWStr)] out string ppwcsLicense);
}

// Managed declaration of the IStemmer COM interface.
// WordBreaker creates an instance from infosoft.dll and uses it to expand each
// word into its alternate forms.
// Do not reorder the methods: declaration order defines the COM vtable slots.
[ComImport]
[Guid("EFBAF140-7F42-11CE-BE57-00AA0051FE20")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IStemmer {
 // Initialises the stemmer; WordBreaker calls this with a max token size of 1000.
 void Init([MarshalAs(UnmanagedType.U4)] int ulMaxTokenSize, [MarshalAs(UnmanagedType.Bool)] out bool pfLicense);
 // Generates the forms of the first cwc characters of pwcInBuf and reports them
 // through the callbacks of pStemSink.
 void GenerateWordForms([MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf, 
  [MarshalAs(UnmanagedType.U4)] int cwc, 
  [MarshalAs(UnmanagedType.Interface)] IWordFormSink pStemSink);
 // Not called in this file.
 // NOTE(review): an "out string" LPWStr makes the marshaller free the returned
 // pointer; confirm the component expects the caller to own that memory before using this.
 void GetLicenseToUse([MarshalAs(UnmanagedType.LPWStr)] out string ppwcsLicense);
}

// Word breaker to break a line of text into words, stem them,
// and return a HashSet of all the words and stems.
// The instance acts as its own COM callback sink: the word breaker reports each
// word through IWordSink.PutWord, which forwards it to the stemmer; the stemmer
// reports every word form back through IWordFormSink, where it is collected.
// Not thread-safe: the result set of the Search call in progress is instance state.
public class WordBreaker : IWordSink, IWordFormSink {
 readonly IWordBreaker wordBreaker;
 readonly IStemmer istm;
 HashSet<string> words; // Words and stems collected by the Search call in progress

 // Creates the word breaker (LangWrbk.dll) and stemmer (infosoft.dll) COM objects
 // and initialises both with a maximum token size of 1000 characters.
 public WordBreaker() {
  bool pfLicense; // license flag reported by Init; not used by this script
  wordBreaker = (IWordBreaker)ComHelper.CreateInstance(langwrbk, breaker);
  wordBreaker.Init(true, 1000, out pfLicense);
  istm = (IStemmer)ComHelper.CreateInstance(infosoft, stemmer);
  istm.Init(1000, out pfLicense);
 }

 // Breaks text into words and returns the set of all word forms the stemmer
 // generates for them. The text is lower-cased before it is broken.
 // Throws ArgumentNullException when text is null.
 public HashSet<string> Search(string text) {
  if (text == null) {
   throw new ArgumentNullException("text");
  }
  words = new HashSet<string>();
  string lowered = text.ToLower();
  TEXT_SOURCE pTextSource = new TEXT_SOURCE();
  pTextSource.pfnFillTextBuffer = new delFillTextBuffer(pfnFillTextBuffer);
  pTextSource.awcBuffer = lowered;
  pTextSource.iCur = 0;
  // Measure the buffer actually handed over, not the original string.
  pTextSource.iEnd = lowered.Length;
  wordBreaker.BreakText(ref pTextSource, (IWordSink)this, null);
  return words;
 }

 // Returns the first cwc characters of a callback buffer. The count is clamped to
 // the marshalled string's length so that an out-of-range count cannot throw
 // from inside a COM callback.
 static string Take(string pwcInBuf, int cwc) {
  return cwc < pwcInBuf.Length ? pwcInBuf.Substring(0, cwc) : pwcInBuf;
 }

 #region IWordFormSink Members
 // Stemmer callback: records an alternate form of the word.
 public void PutAltWord(string pwcInBuf, int cwc) {
  words.Add(Take(pwcInBuf, cwc));
 }

 // Stemmer callback: records a form of the word.
 public void PutWord(string pwcInBuf, int cwc) {
  words.Add(Take(pwcInBuf, cwc));
 }
 #endregion

 #region IWordSink Members
 // Word breaker callback: hands the word to the stemmer, which reports its
 // forms back through the IWordFormSink members above.
 public void PutWord(int cwc, string pwcInBuf, int cwcSrcLen, int cwcSrcPos) {
  string word = Take(pwcInBuf, cwc);
  istm.GenerateWordForms(word, word.Length, this);
 }

 // The remaining word breaker callbacks are deliberately ignored.
 public void PutAltWord(int cwc, string pwcInBuf, int cwcSrcLen, int cwcSrcPos) {
 }

 public void StartAltPhrase() {
 }

 public void EndAltPhrase() {
 }

 public void PutBreak(WORDREP_BREAK_TYPE breakType) {
 }
 #endregion
}

// Refill callback installed in TEXT_SOURCE.pfnFillTextBuffer.
// It never supplies more text: every call reports WBREAK_E_END_OF_TEXT,
// so the whole input has to be in the buffer up front.
static uint pfnFillTextBuffer(ref TEXT_SOURCE pTextSource) {
 const uint WBREAK_E_END_OF_TEXT = 0x80041780;
 return WBREAK_E_END_OF_TEXT;
}

// Code from https://gist.github.com/1568627
// By John Jeffery
// Creates COM objects straight from a DLL, without going through the registry,
// by calling the DLL's exported DllGetClassObject function by hand.
static class ComHelper
{
 // Managed signature of the DllGetClassObject export; returns an HRESULT.
 private delegate int DllGetClassObject(ref Guid clsid, ref Guid iid, [Out, MarshalAs(UnmanagedType.Interface)] out IClassFactory classFactory);

 // Creates an instance of the COM class clsid implemented in libraryModule and
 // returns it as an IUnknown-based object (cast it to the interface you need).
 // Throws ArgumentNullException when libraryModule is null, and Win32Exception
 // when the DLL cannot supply a class factory for clsid.
 public static object CreateInstance(LibraryModule libraryModule, Guid clsid)
 {
  if (libraryModule == null)
  {
   throw new ArgumentNullException("libraryModule");
  }

  var classFactory = GetClassFactory(libraryModule, clsid);
  try
  {
   var iid = new Guid("00000000-0000-0000-C000-000000000046"); // IUnknown
   object obj;
   classFactory.CreateInstance(null, ref iid, out obj);
   return obj;
  }
  finally
  {
   // The factory is only needed to create the object; release its COM
   // reference now instead of waiting for the garbage collector.
   Marshal.ReleaseComObject(classFactory);
  }
 }

 // Looks up DllGetClassObject in the module and asks it for the IClassFactory of clsid.
 static IClassFactory GetClassFactory(LibraryModule libraryModule, Guid clsid)
 {
  IntPtr ptr = libraryModule.GetProcAddress("DllGetClassObject");
  var callback = (DllGetClassObject) Marshal.GetDelegateForFunctionPointer(ptr, typeof (DllGetClassObject));

  var classFactoryIid = new Guid("00000001-0000-0000-c000-000000000046");
  IClassFactory classFactory;
  var hresult = callback(ref clsid, ref classFactoryIid, out classFactory);

  if (hresult != 0)
  {
   throw new Win32Exception(hresult, "Cannot create class factory");
  }
  return classFactory;
 }
}


// Managed declaration of the COM IClassFactory interface, obtained from a DLL's
// DllGetClassObject export (see ComHelper.GetClassFactory).
// Do not reorder the methods: declaration order defines the COM vtable slots.
[Guid("00000001-0000-0000-c000-000000000046")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
[ComImport]
interface IClassFactory
{
 // Creates an object and returns the interface identified by riid in ppvObject.
 // ComHelper calls this with a null pUnkOuter and the IUnknown IID.
 void CreateInstance([MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref Guid riid, [MarshalAs(UnmanagedType.IUnknown)] out object ppvObject);
 // Not called in this file.
 void LockServer(bool fLock);
}

// Owns a native DLL loaded with LoadLibrary and frees it with FreeLibrary when
// disposed (or finalized). Use LoadModule to create an instance.
class LibraryModule : IDisposable
{
 // Module handle; reset to IntPtr.Zero once the library has been freed so that
 // a second Dispose (or the finalizer) cannot free it twice.
 private IntPtr _handle;
 private readonly string _filePath;

 private static class Win32
 {
  // Export names are ANSI strings, so this one stays CharSet.Ansi.
  [DllImport("kernel32.dll", CharSet = CharSet.Ansi, SetLastError = true)]
  public static extern IntPtr GetProcAddress(IntPtr hModule, string lpProcName);

  [DllImport("kernel32.dll")]
  public static extern bool FreeLibrary(IntPtr hModule);

  // Bound to LoadLibraryW so that paths containing non-ANSI characters load correctly.
  [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
  public static extern IntPtr LoadLibrary(string lpFileName);
 }


 // Loads the DLL at filePath.
 // Throws Win32Exception (carrying the Win32 error code) when the library cannot be loaded.
 public static LibraryModule LoadModule(string filePath)
 {
  IntPtr handle = Win32.LoadLibrary(filePath);
  if (handle == IntPtr.Zero)
  {
   int error = Marshal.GetLastWin32Error();
   throw new Win32Exception(error, "Cannot load library: " + filePath);
  }

  return new LibraryModule(handle, filePath);
 }

 private LibraryModule(IntPtr handle, string filePath)
 {
  _filePath = filePath;
  _handle = handle;
 }

 ~LibraryModule()
 {
  Release();
 }

 // Frees the library. Safe to call more than once.
 public void Dispose()
 {
  Release();
  GC.SuppressFinalize(this);
 }

 // Frees the native module exactly once.
 private void Release()
 {
  if (_handle != IntPtr.Zero)
  {
   Win32.FreeLibrary(_handle);
   _handle = IntPtr.Zero;
  }
 }

 // Returns the address of the export called name.
 // Throws ObjectDisposedException after Dispose, and Win32Exception when the
 // module has no such export.
 public IntPtr GetProcAddress(string name)
 {
  if (_handle == IntPtr.Zero)
  {
   throw new ObjectDisposedException("LibraryModule");
  }

  // Look up the requested export (previously the name argument was ignored
  // and "DllGetClassObject" was always used).
  IntPtr ptr = Win32.GetProcAddress(_handle, name);
  if (ptr == IntPtr.Zero)
  {
   int error = Marshal.GetLastWin32Error();
   string message = string.Format("Cannot find proc {0} in {1}", name, _filePath);
   throw new Win32Exception(error, message);
  }
  return ptr;
 }

 // Path the module was loaded from.
 public string FilePath
 {
  get { return _filePath; }
 }
}