00001
00002
00003
00004 #include "common.h"
00005 #include "hash.h"
00006 #include "linefile.h"
00007 #include "trix.h"
00008 #include "sqlNum.h"
00009
00010
00011 struct trixHitPos
00012
00013 {
00014 struct trixHitPos *next;
00015 char *itemId;
00016 int wordIx;
00017 int leftoverLetters;
00018 };
00019
00020 struct trixWordResult
00021
00022 {
00023 struct trixWordResult *next;
00024 char *word;
00025 struct trixHitPos *hitList;
00026 struct trixHitPos *hit;
00027 struct trixHitPos *iHit;
00028 };
00029
00030 struct trixIxx
00031
00032 {
00033 off_t pos;
00034 char prefix[trixPrefixSize];
00035 };
00036
00037
00038
00039 static void trixHitPosFree(struct trixHitPos **pPos)
00040
00041 {
00042 struct trixHitPos *pos = *pPos;
00043 if (pos != NULL)
00044 {
00045 freeMem(pos->itemId);
00046 freez(pPos);
00047 }
00048 }
00049
00050 static void trixHitPosFreeList(struct trixHitPos **pList)
00051
00052 {
00053 struct trixHitPos *el, *next;
00054 for (el = *pList; el != NULL; el = next)
00055 {
00056 next = el->next;
00057 trixHitPosFree(&el);
00058 }
00059 *pList = NULL;
00060 }
00061
00062 static void trixWordResultFree(struct trixWordResult **pTwr)
00063
00064 {
00065 struct trixWordResult *twr = *pTwr;
00066 if (twr != NULL)
00067 {
00068 freeMem(twr->word);
00069 freez(pTwr);
00070 }
00071 }
00072
00073 static void trixWordResultFreeList(struct trixWordResult **pList)
00074
00075 {
00076 struct trixWordResult *el, *next;
00077 for (el = *pList; el != NULL; el = next)
00078 {
00079 next = el->next;
00080 trixWordResultFree(&el);
00081 }
00082 *pList = NULL;
00083 }
00084
00085 static void freeHitCallback(void *val)
00086
00087 {
00088 struct trixHitPos *posList = val;
00089 trixHitPosFreeList(&posList);
00090 }
00091
00092 void trixClose(struct trix **pTrix)
00093
00094 {
00095 struct trix *trix = *pTrix;
00096 if (trix != NULL)
00097 {
00098 freeMem(trix->ixx);
00099 hashTraverseVals(trix->wordHitHash, freeHitCallback);
00100 hashFree(&trix->wordHitHash);
00101 freez(pTrix);
00102 }
00103 }
00104
00105 void trixSearchResultFree(struct trixSearchResult **pTsr)
00106
00107 {
00108 struct trixSearchResult *tsr = *pTsr;
00109 if (tsr != NULL)
00110 {
00111 freeMem(tsr->itemId);
00112 freez(pTsr);
00113 }
00114 }
00115
00116 void trixSearchResultFreeList(struct trixSearchResult **pList)
00117
00118 {
00119 struct trixSearchResult *el, *next;
00120 for (el = *pList; el != NULL; el = next)
00121 {
00122 next = el->next;
00123 trixSearchResultFree(&el);
00124 }
00125 *pList = NULL;
00126 }
00127
00128
00129
00130
00131 static char unhexTable[128];
00132
00133 static void initUnhexTable()
00134
00135 {
00136 unhexTable['0'] = 0;
00137 unhexTable['1'] = 1;
00138 unhexTable['2'] = 2;
00139 unhexTable['3'] = 3;
00140 unhexTable['4'] = 4;
00141 unhexTable['5'] = 5;
00142 unhexTable['6'] = 6;
00143 unhexTable['7'] = 7;
00144 unhexTable['8'] = 8;
00145 unhexTable['9'] = 9;
00146 unhexTable['A'] = 10;
00147 unhexTable['B'] = 11;
00148 unhexTable['C'] = 12;
00149 unhexTable['D'] = 13;
00150 unhexTable['E'] = 14;
00151 unhexTable['F'] = 15;
00152 }
00153
00154 static off_t unhex(char hex[10])
00155
00156 {
00157 off_t x = 0;
00158 int i;
00159
00160 for (i=0; i<10; ++i)
00161 {
00162 x <<= 4;
00163 x += unhexTable[(unsigned)hex[i]];
00164 }
00165 return x;
00166 }
00167
00168 struct trix *trixNew()
00169
00170 {
00171 struct trix *trix;
00172 AllocVar(trix);
00173 trix->ixxAlloc = 8*1024;
00174 AllocArray(trix->ixx, trix->ixxAlloc);
00175 trix->wordHitHash = newHash(8);
00176 return trix;
00177 }
00178
00179 void trixAddToIxx(struct trix *trix, off_t pos, char *prefix)
00180
00181 {
00182 struct trixIxx *ixx;
00183 if (trix->ixxSize >= trix->ixxAlloc)
00184 {
00185 trix->ixxAlloc += trix->ixxAlloc;
00186 ExpandArray(trix->ixx, trix->ixxSize, trix->ixxAlloc);
00187 }
00188 ixx = trix->ixx + trix->ixxSize;
00189 ixx->pos = pos;
00190 memcpy(ixx->prefix, prefix, sizeof(ixx->prefix));
00191 trix->ixxSize += 1;
00192 }
00193
00194 struct trix *trixOpen(char *ixFile)
00195
00196 {
00197 char ixxFile[PATH_LEN];
00198 struct trix *trix;
00199 struct lineFile *lf;
00200 char *line;
00201
00202 initUnhexTable();
00203 safef(ixxFile, sizeof(ixxFile), "%sx", ixFile);
00204 lf = lineFileOpen(ixxFile, TRUE);
00205 trix = trixNew();
00206 while (lineFileNext(lf, &line, NULL))
00207 {
00208 off_t pos = unhex(line+trixPrefixSize);
00209 trixAddToIxx(trix, pos, line);
00210 }
00211 lineFileClose(&lf);
00212 trix->lf = lineFileOpen(ixFile, TRUE);
00213 return trix;
00214 }
00215
00216 void trixCopyToPrefix(char *word, char *prefix)
00217
00218 {
00219 int len = strlen(word);
00220 if (len >= trixPrefixSize)
00221 memcpy(prefix, word, trixPrefixSize);
00222 else
00223 {
00224 memset(prefix, ' ', trixPrefixSize);
00225 memcpy(prefix, word, len);
00226 }
00227 }
00228
00229 static off_t trixFindIndexStartLine(struct trix *trix, char *word)
00230
00231
00232 {
00233 char wordPrefix[trixPrefixSize];
00234 int i;
00235 off_t pos = 0;
00236
00237 trixCopyToPrefix(word, wordPrefix);
00238 toLowerN(wordPrefix, trixPrefixSize);
00239 for (i=0; i<trix->ixxSize; ++i)
00240 {
00241 struct trixIxx *ixx = trix->ixx + i;
00242 if (memcmp(wordPrefix, ixx->prefix, trixPrefixSize) < 0)
00243 break;
00244 pos = ixx->pos;
00245 }
00246 return pos;
00247 }
00248
00249 static struct trixHitPos *trixParseHitList(char *hitWord, char *hitString,
00250 int leftoverLetters)
00251
00252
00253 {
00254 struct trixHitPos *hit, *hitList = NULL;
00255 char *word;
00256 while ((word = nextWord(&hitString)) != NULL)
00257 {
00258 char *parts[3];
00259 int partCount;
00260 partCount = chopByChar(word, ',', parts, ArraySize(parts));
00261 if (partCount != 2)
00262 errAbort("Error in index format at word %s", hitWord);
00263 AllocVar(hit);
00264 hit->itemId = cloneString(parts[0]);
00265 hit->wordIx = sqlUnsigned(parts[1]);
00266 hit->leftoverLetters = leftoverLetters;
00267 slAddHead(&hitList, hit);
00268 }
00269 slReverse(&hitList);
00270 return hitList;
00271 }
00272
00273 int trixHitPosCmp(struct trixHitPos *a, struct trixHitPos *b)
00274
00275 {
00276 int diff = strcmp(a->itemId, b->itemId);
00277 if (diff == 0)
00278 {
00279 diff = a->wordIx - b->wordIx;
00280 if (diff == 0)
00281 diff = a->leftoverLetters - b->leftoverLetters;
00282 }
00283 return diff;
00284 }
00285
00286
00287 struct trixHitPos *mergeHits(struct trixHitPos *aList, struct trixHitPos *bList)
00288
00289
00290 {
00291 struct trixHitPos *a, *b, *aNext, *bNext, *newList = NULL;
00292
00293 a = aList;
00294 b = bList;
00295 for (;;)
00296 {
00297 if (a == NULL)
00298 {
00299 if (b == NULL)
00300 break;
00301 bNext = b->next;
00302 slAddHead(&newList, b);
00303 b = bNext;
00304 }
00305 else if (b == NULL)
00306 {
00307 aNext = a->next;
00308 slAddHead(&newList, a);
00309 a = aNext;
00310 }
00311 else if (trixHitPosCmp(a, b) < 0)
00312 {
00313 aNext = a->next;
00314 slAddHead(&newList, a);
00315 a = aNext;
00316 }
00317 else
00318 {
00319 bNext = b->next;
00320 slAddHead(&newList, b);
00321 b = bNext;
00322 }
00323 }
00324 slReverse(&newList);
00325 return newList;
00326 }
00327
00328 static int reasonablePrefix(char *prefix, char *word, boolean expand)
00329
00330
00331
00332 {
00333 int prefixLen = strlen(prefix);
00334 int wordLen = strlen(word);
00335 int suffixLen = wordLen - prefixLen;
00336 if (suffixLen == 0)
00337 return 0;
00338 else if (expand && prefixLen >= 3)
00339 {
00340 int wordEnd;
00341 char *suffix = word + prefixLen;
00342 boolean prefixEndsInDigit = isdigit(word[prefixLen-1]);
00343
00344
00345 for (wordEnd=0; wordEnd < suffixLen; ++wordEnd)
00346 {
00347 char c = suffix[wordEnd];
00348 if (c == '-' || c == '.' || c == '_' || (!prefixEndsInDigit && isdigit(c)))
00349 break;
00350 }
00351 if (wordEnd <= 2)
00352 return wordEnd;
00353 if (wordEnd == 3 && startsWith("ing", suffix))
00354 return wordEnd;
00355 }
00356 return -1;
00357 }
00358
00359
00360 struct trixWordResult *trixSearchWordResults(struct trix *trix,
00361 char *searchWord, boolean expand)
00362
00363 {
00364 char *line, *word;
00365 struct trixWordResult *twr = NULL;
00366 struct trixHitPos *hitList = hashFindVal(trix->wordHitHash, searchWord);
00367
00368 if (hitList == NULL)
00369 {
00370 struct trixHitPos *oneHitList;
00371 off_t ixPos = trixFindIndexStartLine(trix, searchWord);
00372 lineFileSeek(trix->lf, ixPos, SEEK_SET);
00373 while (lineFileNext(trix->lf, &line, NULL))
00374 {
00375 word = nextWord(&line);
00376 if (startsWith(searchWord, word))
00377 {
00378 int leftoverLetters = reasonablePrefix(searchWord, word, expand);
00379
00380 if (leftoverLetters >= 0)
00381 {
00382 oneHitList = trixParseHitList(searchWord, line,
00383 leftoverLetters);
00384 hitList = mergeHits(hitList, oneHitList);
00385 }
00386 }
00387 else if (strcmp(searchWord, word) < 0)
00388 break;
00389 }
00390 hashAdd(trix->wordHitHash, searchWord, hitList);
00391 }
00392 if (hitList != NULL)
00393 {
00394 AllocVar(twr);
00395 twr->word = cloneString(searchWord);
00396 twr->hitList = hitList;
00397 }
00398 return twr;
00399 }
00400
00401
00402 #ifdef DEBUG
00403 void trwDump(struct trixWordResult *twr)
00404
00405 {
00406 struct trixHitPos *hit;
00407 int hitIx, maxHits = 8;
00408
00409 printf("%d matches to %s:", slCount(twr->hitList), twr->word);
00410 for (hit=twr->hitList, hitIx=0; hit != NULL && hitIx < maxHits; hit=hit->next, hitIx+=1)
00411 printf(" %s@%d", hit->itemId, hit->wordIx);
00412 if (hit != NULL)
00413 printf(" ...");
00414 printf("<BR>\n");
00415 }
00416 #endif
00417
00418 static char *highestId(struct trixWordResult *twrList)
00419
00420 {
00421 char *itemId = twrList->hit->itemId;
00422 struct trixWordResult *twr;
00423
00424 for (twr = twrList->next; twr != NULL; twr = twr->next)
00425 {
00426 if (strcmp(itemId, twr->hit->itemId) < 0)
00427 itemId = twr->hit->itemId;
00428 }
00429 return itemId;
00430 }
00431
00432 static boolean seekOneToId(struct trixWordResult *twr, char *itemId)
00433
00434
00435 {
00436 struct trixHitPos *hit;
00437 int diff = -1;
00438 for (hit = twr->hit; hit != NULL; hit = hit->next)
00439 {
00440 diff = strcmp(itemId, hit->itemId);
00441 if (diff <= 0)
00442 break;
00443 }
00444 twr->hit = hit;
00445 return diff == 0;
00446 }
00447
00448 static boolean seekAllToId(struct trixWordResult *twrList, char *itemId)
00449
00450 {
00451 struct trixWordResult *twr;
00452 boolean allHit = TRUE;
00453 for (twr = twrList; twr != NULL; twr = twr->next)
00454 {
00455 if (!seekOneToId(twr, itemId))
00456 allHit = FALSE;
00457 }
00458 return allHit;
00459 }
00460
00461 static void seekAllPastId(struct trixWordResult *twrList, char *itemId)
00462
00463 {
00464 struct trixWordResult *twr;
00465 for (twr = twrList; twr != NULL; twr = twr->next)
00466 {
00467 struct trixHitPos *hit;
00468 for (hit = twr->hit; hit != NULL; hit = hit->next)
00469 if (!sameString(hit->itemId, itemId))
00470 break;
00471 twr->hit = hit;
00472 }
00473 }
00474
00475 static boolean anyTwrDone(struct trixWordResult *twrList)
00476
00477 {
00478 struct trixWordResult *twr;
00479 for (twr = twrList; twr != NULL; twr = twr->next)
00480 if (twr->hit == NULL)
00481 return TRUE;
00482 return FALSE;
00483 }
00484
00485 static void findUnorderedSpan(struct trixWordResult *twrList,
00486 char *itemId, int *retSpan, int *retLeftoverLetters)
00487
00488
00489 {
00490 int minSpan = BIGNUM;
00491 int leftoverLetters = 0;
00492 struct trixWordResult *twr;
00493
00494
00495
00496
00497 for (twr = twrList; twr != NULL; twr = twr->next)
00498 twr->iHit = twr->hit;
00499
00500 for (;;)
00501 {
00502 int minWord = BIGNUM, maxWord=0, span;
00503 int curLeftover = 0;
00504
00505
00506 for (twr = twrList; twr != NULL; twr = twr->next)
00507 {
00508 int curWord = twr->iHit->wordIx;
00509 if (curWord < minWord)
00510 minWord = curWord;
00511 if (curWord > maxWord)
00512 maxWord = curWord;
00513 curLeftover += twr->iHit->leftoverLetters;
00514 }
00515 span = maxWord - minWord;
00516 if (span < minSpan)
00517 {
00518 minSpan = span;
00519 leftoverLetters = curLeftover;
00520 }
00521
00522
00523 for (twr = twrList; twr != NULL; twr = twr->next)
00524 {
00525 if (twr->iHit->wordIx == minWord)
00526 {
00527 struct trixHitPos *hit = twr->iHit = twr->iHit->next;
00528 if (hit == NULL || !sameString(hit->itemId, itemId))
00529 {
00530 *retSpan = minSpan+1;
00531 *retLeftoverLetters = leftoverLetters;
00532 return;
00533 }
00534 }
00535 }
00536 }
00537 }
00538
00539 static int findWordPos(struct trixWordResult *twrList, char *itemId)
00540
00541
00542
00543
00544 {
00545 int firstWordPos = 0;
00546 struct trixWordResult *twr;
00547 for (twr = twrList; twr != NULL; twr = twr->next)
00548 {
00549 int pos = twr->hit->wordIx;
00550 if (firstWordPos < pos)
00551 firstWordPos = pos;
00552 }
00553 return firstWordPos;
00554 }
00555
00556 static int findOrderedSpan(struct trixWordResult *twrList,
00557 char *itemId)
00558
00559
00560 {
00561 int minSpan = BIGNUM;
00562 struct trixWordResult *twr;
00563
00564
00565
00566
00567 for (twr = twrList; twr != NULL; twr = twr->next)
00568 twr->iHit = twr->hit;
00569
00570 for (;;)
00571 {
00572 int startWord = twrList->iHit->wordIx;
00573 int endWord = startWord;
00574 int span;
00575 struct trixHitPos *hit;
00576
00577
00578 for (twr = twrList->next; twr != NULL; twr = twr->next)
00579 {
00580 for (hit = twr->iHit; ; hit = hit->next)
00581 {
00582 if (hit == NULL || !sameString(hit->itemId, itemId))
00583 return minSpan;
00584 if (hit->wordIx > endWord)
00585 break;
00586 }
00587 twr->iHit = hit;
00588 endWord = hit->wordIx;
00589 }
00590 span = endWord - startWord;
00591 if (span < minSpan)
00592 minSpan = span;
00593
00594
00595 hit = twrList->iHit = twrList->iHit->next;
00596 if (hit == NULL || !sameString(hit->itemId, itemId))
00597 return minSpan+1;
00598 }
00599 }
00600
00601
00602 static struct trixSearchResult *findMultipleWordHits(struct trixWordResult *twrList)
00603
00604 {
00605 struct trixWordResult *twr;
00606 struct trixSearchResult *tsList = NULL, *ts;
00607
00608
00609 for (twr = twrList; twr != NULL; twr = twr->next)
00610 twr->hit = twr->hitList;
00611
00612 for (;;)
00613 {
00614 char *itemId = highestId(twrList);
00615 if (seekAllToId(twrList, itemId))
00616 {
00617 AllocVar(ts);
00618 ts->itemId = cloneString(itemId);
00619 findUnorderedSpan(twrList, itemId,
00620 &ts->unorderedSpan, &ts->leftoverLetters);
00621 ts->orderedSpan = findOrderedSpan(twrList, itemId);
00622 ts->wordPos = findWordPos(twrList, itemId);
00623 slAddHead(&tsList, ts);
00624 }
00625 seekAllPastId(twrList, itemId);
00626 if (anyTwrDone(twrList))
00627 break;
00628 }
00629 slReverse(&tsList);
00630 return tsList;
00631 }
00632
00633 int trixSearchResultCmp(const void *va, const void *vb)
00634
00635 {
00636 const struct trixSearchResult *a = *((struct trixSearchResult **)va);
00637 const struct trixSearchResult *b = *((struct trixSearchResult **)vb);
00638 int dif;
00639 dif = a->unorderedSpan - b->unorderedSpan;
00640 if (dif == 0)
00641 {
00642 dif = a->orderedSpan - b->orderedSpan;
00643 if (dif == 0)
00644 {
00645 dif = a->leftoverLetters - b->leftoverLetters;
00646 if (dif == 0)
00647 dif = a->wordPos - b->wordPos;
00648 }
00649 }
00650
00651 return dif;
00652 }
00653
00654 struct trixSearchResult *trixSearch(struct trix *trix, int wordCount, char **words,
00655 boolean expand)
00656
00657
00658
00659
00660
00661 {
00662 struct trixWordResult *twr, *twrList = NULL;
00663 struct trixSearchResult *ts, *tsList = NULL;
00664 int wordIx;
00665 boolean gotMiss = FALSE;
00666
00667 if (wordCount == 1)
00668 {
00669 struct trixHitPos *hit;
00670 char *lastId = "";
00671 twr = twrList = trixSearchWordResults(trix, words[0], expand);
00672 if (twr == NULL)
00673 return NULL;
00674 for (hit = twr->hitList; hit != NULL; hit = hit->next)
00675 {
00676 if (!sameString(lastId, hit->itemId))
00677 {
00678 lastId = hit->itemId;
00679 AllocVar(ts);
00680 ts->itemId = hit->itemId;
00681 hit->itemId = NULL;
00682 ts->orderedSpan = 1;
00683 ts->unorderedSpan = 1;
00684 ts->wordPos = hit->wordIx;
00685 ts->leftoverLetters = hit->leftoverLetters;
00686 slAddHead(&tsList, ts);
00687 }
00688 }
00689 }
00690 else
00691 {
00692 for (wordIx=0; wordIx<wordCount; ++wordIx)
00693 {
00694 char *searchWord = words[wordIx];
00695 twr = trixSearchWordResults(trix, searchWord, expand);
00696 if (twr == NULL)
00697 {
00698 gotMiss = TRUE;
00699 break;
00700 }
00701 slAddHead(&twrList, twr);
00702 #ifdef DEBUG
00703 trwDump(twr);
00704 #endif
00705 }
00706 if (!gotMiss)
00707 {
00708 slReverse(&twrList);
00709 tsList = findMultipleWordHits(twrList);
00710 }
00711 }
00712 trixWordResultFreeList(&twrList);
00713 slSort(&tsList, trixSearchResultCmp);
00714 return tsList;
00715 }
00716
00717