include/wordLookupDFA.h File Reference

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  group

Functions

void wordLookupDFA_build (struct PSSMatrix PSSMatrix, int4 numCodes, int4 wordLength)
void wordLookupDFA_print ()
void wordLookupDFA_free ()

Variables

uint2 * wordLookupDFA_additionalQueryPositions
int4 wordLookupDFA_numAdditionalQueryPositions
groupwordLookupDFA_groups
int4 wordLookupDFA_numCodes
int4 wordLookupDFA_wordLength
int4 wordLookupDFA_numBlocks


Function Documentation

void wordLookupDFA_build ( struct PSSMatrix  PSSMatrix,
int4  numCodes,
int4  wordLength 
)

Definition at line 51 of file wordLookupDFA.c.

References initialWord::allocQueryPositions, memSingleBlock::block, codeword::codeword, neighbour::codeword, queryPosition::codewords, constants_initialAllocCodewordQueryPositions, constants_max_int2, encoding_alphabetType, encoding_nucleotide, global_malloc(), global_realloc(), aaFrequencyGroup::groups, int2, int4, PSSMatrix::length, memSingleBlock_free(), memSingleBlock_getEntry(), memSingleBlock_initialize(), memSingleBlock_newEntry(), codeword::next, group::nextGroups, group::nextWords, memSingleBlock::numEntries, initialWord::numQueryPositions, qPosList_addList(), qPosList_free(), qPosList_initialize(), qPosList_numQPosLists, qPosList_processLists(), qPosList_qPosLists, qPosList_reset(), queryPosition::queryPosition, initialWord::queryPositions, uint2, wordLookupDFA, wordLookupDFA_additionalQueryPositions, wordLookupDFA_blockAddresses, wordLookupDFA_calcFrequencyGroups(), wordLookupDFA_getNeighbours(), wordLookupDFA_groups, wordLookupDFA_numAdditionalQueryPositions, wordLookupDFA_numBlocks, wordLookupDFA_numCodes, and wordLookupDFA_wordLength.

Referenced by main().

00052 {
00053         struct initialWord* initialWords, *initialWord;
00054     int4 numExternalPositions = 0;
00055     unsigned char *currentBlock, *currentPosition;
00056     int2 *startExternalPositions, *endExternalPositions;
00057     int2 *externalPositions[numCodes];
00058         int4 count, count2, groupCount, groupCodeword, match;
00059         struct aaFrequencyGroup* aaFrequencyGroups;
00060     int4 codeword, codeword2, queryPosition, *groupList1, *groupList2 = NULL;
00061         unsigned char code;
00062         int4 queryPositionCount;
00063     struct neighbour* neighbours = NULL;
00064     int4 numNeighbours;
00065     int4 numWords, numGroups;
00066         struct memSingleBlock** groupsHash, **groupLists, *groupList;
00067         struct memSingleBlock* queryPositionList;
00068     struct queryPosition* queryPositionEntry;
00069     struct codeword* oldCodeword;
00070 
00071     // Record word lookup structure size details
00072     wordLookupDFA_numCodes = numCodes;
00073     wordLookupDFA_wordLength = wordLength;
00074 
00075     // Start with no additional query positions
00076         wordLookupDFA_additionalQueryPositions = NULL;
00077     wordLookupDFA_numAdditionalQueryPositions = 0;
00078 
00079     // Maximum number of entry codewords and group codewords
00080     numWords = ceil(pow(numCodes, wordLength));
00081         numGroups = ceil(pow(numCodes, wordLength - 1));
00082 
00083     // Declare memory to construct initial words
00084         initialWords = (struct initialWord*)global_malloc(sizeof(struct initialWord)
00085                  * numWords);
00086 
00087     // Declare memory to hold list of neighbours for current query word
00088     if (encoding_alphabetType != encoding_nucleotide)
00089         neighbours = (struct neighbour*)global_malloc(sizeof(struct neighbour) * numWords);
00090 
00091     // Stage 1 - Build initial word structure which contains, for each gram
00092     // a list of query positions that are high scoring
00093 
00094     // Iterate through every possible codeword
00095     codeword = 0;
00096     while (codeword < numWords)
00097     {
00098         // Initialize list of query positions as empty
00099                 initialWords[codeword].queryPositions = NULL;
00100                 initialWords[codeword].numQueryPositions = 0;
00101                 initialWords[codeword].allocQueryPositions = 0;
00102 
00103         codeword++;
00104     }
00105 
00106     // Slide a word-sized window across the query
00107     queryPosition = 0;
00108     while (queryPosition < PSSMatrix.length - wordLength + 1)
00109     {
00110         // Get neighbours for the current query word
00111         numNeighbours = 0;
00112         wordLookupDFA_getNeighbours(PSSMatrix, queryPosition, &numNeighbours,
00113                                     neighbours);
00114 
00115         while (numNeighbours > 0)
00116         {
00117                 numNeighbours--;
00118             codeword = neighbours[numNeighbours].codeword;
00119 
00120             initialWord = initialWords + codeword;
00121 
00122             initialWord->numQueryPositions++;
00123 
00124             // Allocate memory to store additional query position if needed
00125             if (initialWord->numQueryPositions > initialWord->allocQueryPositions)
00126             {
00127                 initialWord->allocQueryPositions
00128                     += constants_initialAllocCodewordQueryPositions;
00129                 initialWord->queryPositions = (int2*)global_realloc(initialWord->queryPositions,
00130                     sizeof(int2) * initialWord->allocQueryPositions);
00131             }
00132 
00133             // Add query position to codeword (record position at END of word)
00134             initialWord->queryPositions[initialWord->numQueryPositions - 1] =
00135                 queryPosition + wordLength - 1;
00136 
00137             // Record total number of query positions that will not fit into
00138             // a unsigned char and will need to be stored externally
00139             if (initialWord->numQueryPositions == 1)
00140             {
00141                 numExternalPositions += 2;
00142             }
00143             else
00144             {
00145                 numExternalPositions++;
00146             }
00147         }
00148 
00149         queryPosition++;
00150     }
00151 
00152     free(neighbours);
00153 
00154     // Contruct array of list of groups that share the same first query position
00155     groupsHash = (struct memSingleBlock**)global_malloc(sizeof(struct memSingleBlock*) * PSSMatrix.length);
00156 
00157     // Initialize each array cell to an empty list
00158     queryPosition = 0;
00159     while (queryPosition < PSSMatrix.length)
00160     {
00161                 groupsHash[queryPosition] = memSingleBlock_initialize(sizeof(struct memSingleBlock*), 10);
00162         queryPosition++;
00163         }
00164 
00165     // Go through each group
00166     groupCodeword = 0;
00167     while (groupCodeword < numGroups)
00168     {
00169         // Find the first query position in the group
00170         code = 0;
00171         queryPosition = 0;
00172         while (code < numCodes && queryPosition == 0)
00173         {
00174             codeword = groupCodeword * numCodes + code;
00175 
00176             // Found an entry
00177             if (initialWords[codeword].numQueryPositions > 0)
00178             {
00179                                 queryPosition = initialWords[codeword].queryPositions[0];
00180             }
00181 
00182             code++;
00183         }
00184 
00185         // Add group's code to hash using that query position
00186         // (If no query positions, add to hash under the zero entry)
00187         groupList = memSingleBlock_initialize(sizeof(struct memSingleBlock*), 1);
00188         *((int4*)memSingleBlock_newEntry(groupList)) = groupCodeword;
00189                 *((struct memSingleBlock**)memSingleBlock_newEntry(groupsHash[queryPosition])) = groupList;
00190 
00191                 groupCodeword++;
00192     }
00193 
00194     // For each entry in the hash table
00195     queryPosition = 0;
00196     while (queryPosition < PSSMatrix.length)
00197     {
00198         groupLists = (struct memSingleBlock**)groupsHash[queryPosition]->block;
00199         // For each group entry A in that hash
00200         count = 0;
00201         while (count < groupsHash[queryPosition]->numEntries)
00202         {
00203                         // For each group entry B after A in the hash
00204             count2 = count + 1;
00205                         while(count2 < groupsHash[queryPosition]->numEntries)
00206             {
00207                 // If both entries contain at least one group
00208                 if (groupLists[count]->numEntries > 0 && groupLists[count2]->numEntries > 0)
00209                 {
00210                     // Check if they have matching query positions
00211                     match = 1;
00212                     code = 0;
00213                     while (code < numCodes)
00214                     {
00215                         // For each word in the group
00216                         groupList1 = (int4*)(groupLists[count]->block);
00217                         groupList2 = (int4*)(groupLists[count2]->block);
00218 
00219                         codeword = groupList1[0] * numCodes + code;
00220                         codeword2 = groupList2[0] * numCodes + code;
00221 
00222                         // Check if the number of positions matches
00223                         if (initialWords[codeword].numQueryPositions !=
00224                             initialWords[codeword2].numQueryPositions)
00225                         {
00226                             // No match, break out
00227                             match = 0;
00228                             break;
00229                         }
00230 
00231                         // Move though the query positions and check for matches
00232                         queryPositionCount = 0;
00233                         while (queryPositionCount < initialWords[codeword].numQueryPositions)
00234                         {
00235                             // If they don't match at this position
00236                             if (initialWords[codeword].queryPositions[queryPositionCount] !=
00237                                 initialWords[codeword2].queryPositions[queryPositionCount])
00238                             {
00239                                 // No match, break out
00240                                 match = 0;
00241                                 break;
00242                             }
00243                             queryPositionCount++;
00244                         }
00245 
00246                         code++;
00247                     }
00248 
00249                     if (match)
00250                     {
00251                         // Move group in entry B to entry A
00252                         *((int4*)memSingleBlock_newEntry(groupLists[count])) = groupList2[0];
00253 
00254                         // Remove group from entry B
00255                         groupLists[count2]->numEntries = 0;
00256                     }
00257                                 }
00258 
00259                 count2++;
00260             }
00261 
00262                 count++;
00263         }
00264 
00265         queryPosition++;
00266         }
00267 
00268     // Calculate frequencies of all possible groups of codes length one less than
00269     // the full word length and then arrange them so the most frequent have central positions
00270     aaFrequencyGroups = wordLookupDFA_calcFrequencyGroups(groupsHash, PSSMatrix,
00271                                                           numCodes, wordLength - 1);
00272 
00273     // Declare memory for addresses of group sized blocks
00274     wordLookupDFA_blockAddresses = (unsigned char**)global_malloc(sizeof(unsigned char*) * numGroups);
00275 
00276     // Declare memory for final words
00277         wordLookupDFA = (unsigned char*)global_malloc(sizeof(char) * numWords + sizeof(int2)
00278                   * numExternalPositions + sizeof(int2) * numExternalPositions);
00279 
00280     // Declare memory to store additional query positions
00281     // (we assume worst case scenario where all positions are addition positions)
00282     wordLookupDFA_additionalQueryPositions
00283         = (uint2*)((char*)wordLookupDFA + sizeof(char) * numWords
00284           + sizeof(int2) * numExternalPositions);
00285 
00286         currentBlock = wordLookupDFA;
00287 
00288     // Initialize query position list construction
00289     qPosList_initialize(numCodes);
00290 
00291     // Stage 2 - Construct the more compact, final wordLookupDFA structure that is arranged
00292     // with high frequency words in the middle (protein only)
00293 
00294     // For each collection of groups with the same set of query positions
00295     count = 0;
00296         while (count < wordLookupDFA_numBlocks)
00297     {
00298         groupList = aaFrequencyGroups[count].groups;
00299         groupList1 = (int4*)(groupList->block);
00300 
00301         // Use first group entry for constructing the block
00302         groupCodeword = groupList1[0];
00303 
00304         // First comes the external codes.
00305         startExternalPositions = (int2*)(currentBlock);
00306         endExternalPositions = startExternalPositions;
00307 
00308                 qPosList_reset();
00309 
00310         // For each word in this block
00311         code = 0;
00312         while (code < numCodes)
00313         {
00314                 // Determine codeword
00315             codeword = groupCodeword * numCodes + code;
00316             queryPositionCount = initialWords[codeword].numQueryPositions;
00317 
00318             // Add list of query positions for this word to the qPosList structure
00319                         qPosList_addList(initialWords[codeword].queryPositions,
00320                              initialWords[codeword].numQueryPositions, code);
00321 
00322                         code++;
00323         }
00324 
00325         // Process the list of query positions to create an optimal set of lists
00326                 qPosList_processLists();
00327 
00328         // For each query position list in order from shortest to longest
00329         while (qPosList_numQPosLists > 0)
00330         {
00331                 qPosList_numQPosLists--;
00332             queryPositionList = qPosList_qPosLists + qPosList_numQPosLists;
00333 
00334             // First check that there is room for the list in external area,
00335             // plus at least 3 int2s for each remaining code all within
00336             // the range of an unsigned char (256)
00337             if (endExternalPositions - startExternalPositions + queryPositionList->numEntries
00338                 + 1 >= 256 - 3 * numCodes)
00339                         {
00340                 // If not, we need to store this word's query positions
00341                 // at a completely external location
00342 
00343                 // Copy values to additional positions address
00344                 count2 = queryPositionList->numEntries;
00345                 while (count2 > 0)
00346                 {
00347                         count2--;
00348                         queryPositionEntry = memSingleBlock_getEntry(queryPositionList, count2);
00349 
00350                                         // Copy the query position
00351                     wordLookupDFA_additionalQueryPositions[
00352                         wordLookupDFA_numAdditionalQueryPositions]
00353                         = queryPositionEntry->queryPosition;
00354 
00355                     // If a codeword list starts at this point
00356                                         while (queryPositionEntry->codewords != NULL)
00357                     {
00358                         // Get each code
00359                         code = queryPositionEntry->codewords->codeword;
00360 
00361                         // Add address information to external positions
00362                         // First flag to indicate additional positions
00363                         *endExternalPositions = 0;
00364 
00365                         // That's where the word->queryPosition will point to
00366                         externalPositions[code] = endExternalPositions;
00367                         endExternalPositions++;
00368 
00369                         // Then the offset into the array where they start
00370                         // (Record across two int2's to support very long queries)
00371                         *endExternalPositions = wordLookupDFA_numAdditionalQueryPositions / constants_max_int2;
00372                         endExternalPositions++;
00373                         *endExternalPositions = wordLookupDFA_numAdditionalQueryPositions % constants_max_int2;
00374                         endExternalPositions++;
00375 
00376                         // Free the current codeword before moving to the next
00377                         oldCodeword = queryPositionEntry->codewords;
00378                         queryPositionEntry->codewords = queryPositionEntry->codewords->next;
00379                         free(oldCodeword);
00380                     }
00381 
00382                         // Increment number of additional query positions
00383                     wordLookupDFA_numAdditionalQueryPositions++;
00384                 }
00385 
00386                 // Add terminating zero
00387                 wordLookupDFA_additionalQueryPositions[
00388                     wordLookupDFA_numAdditionalQueryPositions] = 0;
00389 
00390                 // Increment number of additional query positions
00391                 wordLookupDFA_numAdditionalQueryPositions++;
00392             }
00393             else
00394             {
00395                 // Use external location for list of query positions
00396 
00397                 // Copy values to external positions address
00398                 count2 = queryPositionList->numEntries;
00399                 while (count2 > 0)
00400                 {
00401                         count2--;
00402                         queryPositionEntry = memSingleBlock_getEntry(queryPositionList, count2);
00403 
00404                                         // Copy the query position
00405                     *endExternalPositions = queryPositionEntry->queryPosition;
00406 
00407                     // If a codeword list starts at this point
00408                                         while (queryPositionEntry->codewords != NULL)
00409                     {
00410                         // Get each code
00411                         code = queryPositionEntry->codewords->codeword;
00412 
00413                         // The word will point to this external position
00414                         externalPositions[code] = endExternalPositions;
00415 
00416                         // Free the current codeword before moving to the next
00417                         oldCodeword = queryPositionEntry->codewords;
00418                         queryPositionEntry->codewords = queryPositionEntry->codewords->next;
00419                         free(oldCodeword);
00420                                         }
00421 
00422                     endExternalPositions++;
00423                                 }
00424 
00425                 // Add zero terminal code
00426                 *endExternalPositions = 0;
00427                 endExternalPositions++;
00428             }
00429         }
00430 
00431         // Next comes the block structure
00432         currentPosition = (unsigned char*)endExternalPositions;
00433 
00434         // For each group, point to that block
00435         groupCount = 0;
00436         while (groupCount < groupList->numEntries)
00437         {
00438             wordLookupDFA_blockAddresses[groupList1[groupCount]] = currentPosition;
00439             groupCount++;
00440         }
00441 
00442         // Free the group list
00443         memSingleBlock_free(groupList);
00444 
00445         code = 0;
00446         while (code < numCodes)
00447         {
00448             codeword = groupCodeword * numCodes + code;
00449 
00450             queryPositionCount = initialWords[codeword].numQueryPositions;
00451 
00452             // No query positions, store NULL both 16-bit integers
00453             if (queryPositionCount == 0)
00454             {
00455                 *currentPosition = 0;
00456             }
00457             // At least one query position
00458             else
00459             {
00460                 // Give location of external query positions
00461                 *currentPosition
00462                         = (unsigned char)(((char*)endExternalPositions -
00463                                        (char*)externalPositions[code]) / sizeof(int2));
00464             }
00465 
00466             // Free memory used by initial structure
00467             free(initialWords[codeword].queryPositions);
00468 
00469             code++;
00470             currentPosition++;
00471         }
00472         currentBlock = currentPosition;
00473 
00474         count++;
00475         }
00476 
00477     free(initialWords);
00478     free(aaFrequencyGroups);
00479 
00480     // Free the query position lists structure
00481     qPosList_free();
00482 
00483     // New stage 3 - construct groups lookup FSA structure
00484 
00485     // Declare memory for group states
00486     wordLookupDFA_groups = (struct group*)global_malloc(sizeof(struct group) * numGroups);
00487 
00488     // For each group
00489     groupCodeword = 0;
00490     while (groupCodeword < numGroups)
00491     {
00492         // Get block associated with current group
00493         currentBlock = wordLookupDFA_blockAddresses[groupCodeword];
00494 
00495                 // Set pointer to block of words
00496         wordLookupDFA_groups[groupCodeword].nextWords = currentBlock;
00497 
00498         // Set pointer to next groups
00499                 wordLookupDFA_groups[groupCodeword].nextGroups =
00500                         wordLookupDFA_groups + ((groupCodeword * numCodes) % numGroups);
00501 
00502         groupCodeword++;
00503         }
00504 }

Here is the call graph for this function:

Here is the caller graph for this function:

void wordLookupDFA_free (  ) 

Definition at line 917 of file wordLookupDFA.c.

References wordLookupDFA, wordLookupDFA_blockAddresses, and wordLookupDFA_groups.

Referenced by main().

00918 {
00919         free(wordLookupDFA_groups);
00920     free(wordLookupDFA);
00921     free(wordLookupDFA_blockAddresses);
00922 }

Here is the caller graph for this function:

void wordLookupDFA_print (  ) 

Definition at line 798 of file wordLookupDFA.c.

References encoding_alphabetType, encoding_getLetter(), encoding_protein, int4, uint2, uint4, wordLookupDFA_additionalQueryPositions, wordLookupDFA_blockAddresses, wordLookupDFA_getCodeword(), wordLookupDFA_numBlocks, wordLookupDFA_numCodes, and wordLookupDFA_wordLength.

00799 {
00800     unsigned char *currentBlock;
00801         unsigned char codes[wordLookupDFA_wordLength + 1], code;
00802         unsigned char word;
00803         int4 groupCodeword;
00804         uint2* queryPositions;
00805     int4 totalQueryPositions = 0, totalEmptySlots = 0;
00806         int4 count;
00807     int4 numWords, numGroups;
00808     int4 numCodes, wordLength;
00809     uint4 codeCount;
00810 
00811         numCodes = wordLookupDFA_numCodes;
00812     wordLength = wordLookupDFA_wordLength;
00813 
00814     // Determine number of codewords and number of groups
00815     numGroups = ceil(pow(numCodes, wordLength - 1));
00816     numWords = ceil(pow(numCodes, wordLength));
00817 
00818     // Initialize word to first position
00819     codeCount = 0;
00820     while (codeCount <= wordLength - 1)
00821     {
00822         codes[codeCount] = 0;
00823         codeCount++;
00824     }
00825 
00826     // Iterate - For each possible group
00827         while (!codes[wordLength - 1])
00828     {
00829         // Construct the codeword for array of codes
00830                 groupCodeword = wordLookupDFA_getCodeword(codes, wordLength - 1);
00831 
00832         // Get current block
00833         currentBlock = wordLookupDFA_blockAddresses[groupCodeword];
00834 
00835         // For each word in the group
00836         code = 0;
00837         while (code < numCodes)
00838         {
00839             // Get word in the block
00840             word = currentBlock[code];
00841 
00842             if (encoding_alphabetType == encoding_protein || word != 0)
00843             {
00844                 //  Print the word
00845                 codeCount = 0;
00846                 while (codeCount < wordLength - 1)
00847                 {
00848                     printf("%c", encoding_getLetter(codes[codeCount]));
00849                     codeCount++;
00850                 }
00851                 printf("%c", encoding_getLetter(code));
00852 
00853                 printf(" QueryPositions ");
00854                 fflush(stdout);
00855 
00856                 printf("[%d]: ", word);
00857 
00858                 if (word == 0)
00859                 {
00860                     // No query positions
00861                     totalEmptySlots++;
00862                 }
00863                 else
00864                 {
00865                     // At least one position at an external address
00866                     queryPositions = ((uint2*)currentBlock) - word;
00867 
00868                     // If the zero flag is stored at the first query position
00869                     if (!*queryPositions)
00870                     {
00871                         // Go to an outside address for additional positions
00872                                                 queryPositions = wordLookupDFA_additionalQueryPositions
00873                                        + *(queryPositions + 1);
00874                     }
00875 
00876                     count = 0;
00877                     while (queryPositions[count] != 0)
00878                     {
00879                         printf("%d ", queryPositions[count] - 1);
00880                         totalQueryPositions++;
00881                         fflush(stdout);
00882                         count++;
00883                     }
00884                 }
00885 
00886                 printf("\n");
00887             }
00888 
00889             code++;
00890                 }
00891 
00892         // Move to next word
00893                 codes[0]++;
00894         codeCount = 0;
00895         while (codeCount < wordLength - 1)
00896         {
00897                         if (codes[codeCount] >= numCodes)
00898             {
00899                 codes[codeCount] = 0;
00900                 codes[codeCount + 1]++;
00901             }
00902             else
00903             {
00904                 break;
00905             }
00906             codeCount++;
00907         }
00908     }
00909 
00910     printf("Total query positions=%d\n", totalQueryPositions);
00911     printf("Empty slots=%d/%d (%d%%)\n", totalEmptySlots, numWords,
00912                                          totalEmptySlots * 100 / numWords);
00913         printf("Number of blocks/groups=%d/%d\n", wordLookupDFA_numBlocks, numGroups);
00914 }

Here is the call graph for this function:


Variable Documentation

uint2* wordLookupDFA_additionalQueryPositions

Definition at line 14 of file wordLookupDFA.c.

Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), and wordLookupDFA_print().

struct group* wordLookupDFA_groups

Definition at line 16 of file wordLookupDFA.c.

Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), and wordLookupDFA_free().

int4 wordLookupDFA_numAdditionalQueryPositions

Definition at line 15 of file wordLookupDFA.c.

Referenced by wordLookupDFA_build().

int4 wordLookupDFA_numBlocks

Definition at line 17 of file wordLookupDFA.c.

Referenced by wordLookupDFA_build(), wordLookupDFA_calcFrequencyGroups(), and wordLookupDFA_print().

int4 wordLookupDFA_numCodes

Definition at line 17 of file wordLookupDFA.c.

Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), wordLookupDFA_findNeighbours(), wordLookupDFA_getCodes(), wordLookupDFA_getCodeword(), and wordLookupDFA_print().

int4 wordLookupDFA_wordLength

Definition at line 17 of file wordLookupDFA.c.

Referenced by wordLookupDFA_build(), wordLookupDFA_findNeighbours(), wordLookupDFA_getNeighbours(), and wordLookupDFA_print().


Generated on Wed Dec 19 20:53:14 2007 for fsa-blast by  doxygen 1.5.2