This graph shows which files directly or indirectly include this file:

Go to the source code of this file.
Data Structures | |
| struct | group |
Functions | |
| void | wordLookupDFA_build (struct PSSMatrix PSSMatrix, int4 numCodes, int4 wordLength) |
| void | wordLookupDFA_print () |
| void | wordLookupDFA_free () |
Variables | |
| uint2 * | wordLookupDFA_additionalQueryPositions |
| int4 | wordLookupDFA_numAdditionalQueryPositions |
| group * | wordLookupDFA_groups |
| int4 | wordLookupDFA_numCodes |
| int4 | wordLookupDFA_wordLength |
| int4 | wordLookupDFA_numBlocks |
| void wordLookupDFA_build | ( | struct PSSMatrix | PSSMatrix, | |
| int4 | numCodes, | |||
| int4 | wordLength | |||
| ) |
Definition at line 51 of file wordLookupDFA.c.
References initialWord::allocQueryPositions, memSingleBlock::block, codeword::codeword, neighbour::codeword, queryPosition::codewords, constants_initialAllocCodewordQueryPositions, constants_max_int2, encoding_alphabetType, encoding_nucleotide, global_malloc(), global_realloc(), aaFrequencyGroup::groups, int2, int4, PSSMatrix::length, memSingleBlock_free(), memSingleBlock_getEntry(), memSingleBlock_initialize(), memSingleBlock_newEntry(), codeword::next, group::nextGroups, group::nextWords, memSingleBlock::numEntries, initialWord::numQueryPositions, qPosList_addList(), qPosList_free(), qPosList_initialize(), qPosList_numQPosLists, qPosList_processLists(), qPosList_qPosLists, qPosList_reset(), queryPosition::queryPosition, initialWord::queryPositions, uint2, wordLookupDFA, wordLookupDFA_additionalQueryPositions, wordLookupDFA_blockAddresses, wordLookupDFA_calcFrequencyGroups(), wordLookupDFA_getNeighbours(), wordLookupDFA_groups, wordLookupDFA_numAdditionalQueryPositions, wordLookupDFA_numBlocks, wordLookupDFA_numCodes, and wordLookupDFA_wordLength.
Referenced by main().
00052 { 00053 struct initialWord* initialWords, *initialWord; 00054 int4 numExternalPositions = 0; 00055 unsigned char *currentBlock, *currentPosition; 00056 int2 *startExternalPositions, *endExternalPositions; 00057 int2 *externalPositions[numCodes]; 00058 int4 count, count2, groupCount, groupCodeword, match; 00059 struct aaFrequencyGroup* aaFrequencyGroups; 00060 int4 codeword, codeword2, queryPosition, *groupList1, *groupList2 = NULL; 00061 unsigned char code; 00062 int4 queryPositionCount; 00063 struct neighbour* neighbours = NULL; 00064 int4 numNeighbours; 00065 int4 numWords, numGroups; 00066 struct memSingleBlock** groupsHash, **groupLists, *groupList; 00067 struct memSingleBlock* queryPositionList; 00068 struct queryPosition* queryPositionEntry; 00069 struct codeword* oldCodeword; 00070 00071 // Record word lookup structure size details 00072 wordLookupDFA_numCodes = numCodes; 00073 wordLookupDFA_wordLength = wordLength; 00074 00075 // Start with no additional query positions 00076 wordLookupDFA_additionalQueryPositions = NULL; 00077 wordLookupDFA_numAdditionalQueryPositions = 0; 00078 00079 // Maximum number of entry codewords and group codewords 00080 numWords = ceil(pow(numCodes, wordLength)); 00081 numGroups = ceil(pow(numCodes, wordLength - 1)); 00082 00083 // Declare memory to construct initial words 00084 initialWords = (struct initialWord*)global_malloc(sizeof(struct initialWord) 00085 * numWords); 00086 00087 // Declare memory to hold list of neighbours for current query word 00088 if (encoding_alphabetType != encoding_nucleotide) 00089 neighbours = (struct neighbour*)global_malloc(sizeof(struct neighbour) * numWords); 00090 00091 // Stage 1 - Build initial word structure which contains, for each gram 00092 // a list of query positions that are high scoring 00093 00094 // Iterate through every possible codeword 00095 codeword = 0; 00096 while (codeword < numWords) 00097 { 00098 // Initialize list of query positions as empty 00099 initialWords[codeword].queryPositions = NULL; 00100 initialWords[codeword].numQueryPositions = 0; 00101 initialWords[codeword].allocQueryPositions = 0; 00102 00103 codeword++; 00104 } 00105 00106 // Slide a word-sized window across the query 00107 queryPosition = 0; 00108 while (queryPosition < PSSMatrix.length - wordLength + 1) 00109 { 00110 // Get neighbours for the current query word 00111 numNeighbours = 0; 00112 wordLookupDFA_getNeighbours(PSSMatrix, queryPosition, &numNeighbours, 00113 neighbours); 00114 00115 while (numNeighbours > 0) 00116 { 00117 numNeighbours--; 00118 codeword = neighbours[numNeighbours].codeword; 00119 00120 initialWord = initialWords + codeword; 00121 00122 initialWord->numQueryPositions++; 00123 00124 // Allocate memory to store additional query position if needed 00125 if (initialWord->numQueryPositions > initialWord->allocQueryPositions) 00126 { 00127 initialWord->allocQueryPositions 00128 += constants_initialAllocCodewordQueryPositions; 00129 initialWord->queryPositions = (int2*)global_realloc(initialWord->queryPositions, 00130 sizeof(int2) * initialWord->allocQueryPositions); 00131 } 00132 00133 // Add query position to codeword (record position at END of word) 00134 initialWord->queryPositions[initialWord->numQueryPositions - 1] = 00135 queryPosition + wordLength - 1; 00136 00137 // Record total number of query positions that will not fit into 00138 // a unsigned char and will need to be stored externally 00139 if (initialWord->numQueryPositions == 1) 00140 { 00141 numExternalPositions += 2; 00142 } 00143 else 00144 { 00145 numExternalPositions++; 00146 } 00147 } 00148 00149 queryPosition++; 00150 } 00151 00152 free(neighbours); 00153 00154 // Contruct array of list of groups that share the same first query position 00155 groupsHash = (struct memSingleBlock**)global_malloc(sizeof(struct memSingleBlock*) * PSSMatrix.length); 00156 00157 // Initialize each array cell to an empty list 00158 queryPosition = 0; 00159 while (queryPosition < PSSMatrix.length) 00160 { 00161 groupsHash[queryPosition] = memSingleBlock_initialize(sizeof(struct memSingleBlock*), 10); 00162 queryPosition++; 00163 } 00164 00165 // Go through each group 00166 groupCodeword = 0; 00167 while (groupCodeword < numGroups) 00168 { 00169 // Find the first query position in the group 00170 code = 0; 00171 queryPosition = 0; 00172 while (code < numCodes && queryPosition == 0) 00173 { 00174 codeword = groupCodeword * numCodes + code; 00175 00176 // Found an entry 00177 if (initialWords[codeword].numQueryPositions > 0) 00178 { 00179 queryPosition = initialWords[codeword].queryPositions[0]; 00180 } 00181 00182 code++; 00183 } 00184 00185 // Add group's code to hash using that query position 00186 // (If no query positions, add to hash under the zero entry) 00187 groupList = memSingleBlock_initialize(sizeof(struct memSingleBlock*), 1); 00188 *((int4*)memSingleBlock_newEntry(groupList)) = groupCodeword; 00189 *((struct memSingleBlock**)memSingleBlock_newEntry(groupsHash[queryPosition])) = groupList; 00190 00191 groupCodeword++; 00192 } 00193 00194 // For each entry in the hash table 00195 queryPosition = 0; 00196 while (queryPosition < PSSMatrix.length) 00197 { 00198 groupLists = (struct memSingleBlock**)groupsHash[queryPosition]->block; 00199 // For each group entry A in that hash 00200 count = 0; 00201 while (count < groupsHash[queryPosition]->numEntries) 00202 { 00203 // For each group entry B after A in the hash 00204 count2 = count + 1; 00205 while(count2 < groupsHash[queryPosition]->numEntries) 00206 { 00207 // If both entries contain at least one group 00208 if (groupLists[count]->numEntries > 0 && groupLists[count2]->numEntries > 0) 00209 { 00210 // Check if they have matching query positions 00211 match = 1; 00212 code = 0; 00213 while (code < numCodes) 00214 { 00215 // For each word in the group 00216 groupList1 = (int4*)(groupLists[count]->block); 00217 groupList2 = (int4*)(groupLists[count2]->block); 00218 00219 codeword = groupList1[0] * numCodes + code; 00220 codeword2 = groupList2[0] * numCodes + code; 00221 00222 // Check if the number of positions matches 00223 if (initialWords[codeword].numQueryPositions != 00224 initialWords[codeword2].numQueryPositions) 00225 { 00226 // No match, break out 00227 match = 0; 00228 break; 00229 } 00230 00231 // Move though the query positions and check for matches 00232 queryPositionCount = 0; 00233 while (queryPositionCount < initialWords[codeword].numQueryPositions) 00234 { 00235 // If they don't match at this position 00236 if (initialWords[codeword].queryPositions[queryPositionCount] != 00237 initialWords[codeword2].queryPositions[queryPositionCount]) 00238 { 00239 // No match, break out 00240 match = 0; 00241 break; 00242 } 00243 queryPositionCount++; 00244 } 00245 00246 code++; 00247 } 00248 00249 if (match) 00250 { 00251 // Move group in entry B to entry A 00252 *((int4*)memSingleBlock_newEntry(groupLists[count])) = groupList2[0]; 00253 00254 // Remove group from entry B 00255 groupLists[count2]->numEntries = 0; 00256 } 00257 } 00258 00259 count2++; 00260 } 00261 00262 count++; 00263 } 00264 00265 queryPosition++; 00266 } 00267 00268 // Calculate frequencies of all possible groups of codes length one less than 00269 // the full word length and then arrange them so the most frequent have central positions 00270 aaFrequencyGroups = wordLookupDFA_calcFrequencyGroups(groupsHash, PSSMatrix, 00271 numCodes, wordLength - 1); 00272 00273 // Declare memory for addresses of group sized blocks 00274 wordLookupDFA_blockAddresses = (unsigned char**)global_malloc(sizeof(unsigned char*) * numGroups); 00275 00276 // Declare memory for final words 00277 wordLookupDFA = (unsigned char*)global_malloc(sizeof(char) * numWords + sizeof(int2) 00278 * numExternalPositions + sizeof(int2) * numExternalPositions); 00279 00280 // Declare memory to store additional query positions 00281 // (we assume worst case scenario where all positions are addition positions) 00282 wordLookupDFA_additionalQueryPositions 00283 = (uint2*)((char*)wordLookupDFA + sizeof(char) * numWords 00284 + sizeof(int2) * numExternalPositions); 00285 00286 currentBlock = wordLookupDFA; 00287 00288 // Initialize query position list construction 00289 qPosList_initialize(numCodes); 00290 00291 // Stage 2 - Construct the more compact, final wordLookupDFA structure that is arranged 00292 // with high frequency words in the middle (protein only) 00293 00294 // For each collection of groups with the same set of query positions 00295 count = 0; 00296 while (count < wordLookupDFA_numBlocks) 00297 { 00298 groupList = aaFrequencyGroups[count].groups; 00299 groupList1 = (int4*)(groupList->block); 00300 00301 // Use first group entry for constructing the block 00302 groupCodeword = groupList1[0]; 00303 00304 // First comes the external codes. 00305 startExternalPositions = (int2*)(currentBlock); 00306 endExternalPositions = startExternalPositions; 00307 00308 qPosList_reset(); 00309 00310 // For each word in this block 00311 code = 0; 00312 while (code < numCodes) 00313 { 00314 // Determine codeword 00315 codeword = groupCodeword * numCodes + code; 00316 queryPositionCount = initialWords[codeword].numQueryPositions; 00317 00318 // Add list of query positions for this word to the qPosList structure 00319 qPosList_addList(initialWords[codeword].queryPositions, 00320 initialWords[codeword].numQueryPositions, code); 00321 00322 code++; 00323 } 00324 00325 // Process the list of query positions to create an optimal set of lists 00326 qPosList_processLists(); 00327 00328 // For each query position list in order from shortest to longest 00329 while (qPosList_numQPosLists > 0) 00330 { 00331 qPosList_numQPosLists--; 00332 queryPositionList = qPosList_qPosLists + qPosList_numQPosLists; 00333 00334 // First check that there is room for the list in external area, 00335 // plus at least 3 int2s for each remaining code all within 00336 // the range of an unsigned char (256) 00337 if (endExternalPositions - startExternalPositions + queryPositionList->numEntries 00338 + 1 >= 256 - 3 * numCodes) 00339 { 00340 // If not, we need to store this word's query positions 00341 // at a completely external location 00342 00343 // Copy values to additional positions address 00344 count2 = queryPositionList->numEntries; 00345 while (count2 > 0) 00346 { 00347 count2--; 00348 queryPositionEntry = memSingleBlock_getEntry(queryPositionList, count2); 00349 00350 // Copy the query position 00351 wordLookupDFA_additionalQueryPositions[ 00352 wordLookupDFA_numAdditionalQueryPositions] 00353 = queryPositionEntry->queryPosition; 00354 00355 // If a codeword list starts at this point 00356 while (queryPositionEntry->codewords != NULL) 00357 { 00358 // Get each code 00359 code = queryPositionEntry->codewords->codeword; 00360 00361 // Add address information to external positions 00362 // First flag to indicate additional positions 00363 *endExternalPositions = 0; 00364 00365 // That's where the word->queryPosition will point to 00366 externalPositions[code] = endExternalPositions; 00367 endExternalPositions++; 00368 00369 // Then the offset into the array where they start 00370 // (Record across two int2's to support very long queries) 00371 *endExternalPositions = wordLookupDFA_numAdditionalQueryPositions / constants_max_int2; 00372 endExternalPositions++; 00373 *endExternalPositions = wordLookupDFA_numAdditionalQueryPositions % constants_max_int2; 00374 endExternalPositions++; 00375 00376 // Free the current codeword before moving to the next 00377 oldCodeword = queryPositionEntry->codewords; 00378 queryPositionEntry->codewords = queryPositionEntry->codewords->next; 00379 free(oldCodeword); 00380 } 00381 00382 // Increment number of additional query positions 00383 wordLookupDFA_numAdditionalQueryPositions++; 00384 } 00385 00386 // Add terminating zero 00387 wordLookupDFA_additionalQueryPositions[ 00388 wordLookupDFA_numAdditionalQueryPositions] = 0; 00389 00390 // Increment number of additional query positions 00391 wordLookupDFA_numAdditionalQueryPositions++; 00392 } 00393 else 00394 { 00395 // Use external location for list of query positions 00396 00397 // Copy values to external positions address 00398 count2 = queryPositionList->numEntries; 00399 while (count2 > 0) 00400 { 00401 count2--; 00402 queryPositionEntry = memSingleBlock_getEntry(queryPositionList, count2); 00403 00404 // Copy the query position 00405 *endExternalPositions = queryPositionEntry->queryPosition; 00406 00407 // If a codeword list starts at this point 00408 while (queryPositionEntry->codewords != NULL) 00409 { 00410 // Get each code 00411 code = queryPositionEntry->codewords->codeword; 00412 00413 // The word will point to this external position 00414 externalPositions[code] = endExternalPositions; 00415 00416 // Free the current codeword before moving to the next 00417 oldCodeword = queryPositionEntry->codewords; 00418 queryPositionEntry->codewords = queryPositionEntry->codewords->next; 00419 free(oldCodeword); 00420 } 00421 00422 endExternalPositions++; 00423 } 00424 00425 // Add zero terminal code 00426 *endExternalPositions = 0; 00427 endExternalPositions++; 00428 } 00429 } 00430 00431 // Next comes the block structure 00432 currentPosition = (unsigned char*)endExternalPositions; 00433 00434 // For each group, point to that block 00435 groupCount = 0; 00436 while (groupCount < groupList->numEntries) 00437 { 00438 wordLookupDFA_blockAddresses[groupList1[groupCount]] = currentPosition; 00439 groupCount++; 00440 } 00441 00442 // Free the group list 00443 memSingleBlock_free(groupList); 00444 00445 code = 0; 00446 while (code < numCodes) 00447 { 00448 codeword = groupCodeword * numCodes + code; 00449 00450 queryPositionCount = initialWords[codeword].numQueryPositions; 00451 00452 // No query positions, store NULL both 16-bit integers 00453 if (queryPositionCount == 0) 00454 { 00455 *currentPosition = 0; 00456 } 00457 // At least one query position 00458 else 00459 { 00460 // Give location of external query positions 00461 *currentPosition 00462 = (unsigned char)(((char*)endExternalPositions - 00463 (char*)externalPositions[code]) / sizeof(int2)); 00464 } 00465 00466 // Free memory used by initial structure 00467 free(initialWords[codeword].queryPositions); 00468 00469 code++; 00470 currentPosition++; 00471 } 00472 currentBlock = currentPosition; 00473 00474 count++; 00475 } 00476 00477 free(initialWords); 00478 free(aaFrequencyGroups); 00479 00480 // Free the query position lists structure 00481 qPosList_free(); 00482 00483 // New stage 3 - construct groups lookup FSA structure 00484 00485 // Declare memory for group states 00486 wordLookupDFA_groups = (struct group*)global_malloc(sizeof(struct group) * numGroups); 00487 00488 // For each group 00489 groupCodeword = 0; 00490 while (groupCodeword < numGroups) 00491 { 00492 // Get block associated with current group 00493 currentBlock = wordLookupDFA_blockAddresses[groupCodeword]; 00494 00495 // Set pointer to block of words 00496 wordLookupDFA_groups[groupCodeword].nextWords = currentBlock; 00497 00498 // Set pointer to next groups 00499 wordLookupDFA_groups[groupCodeword].nextGroups = 00500 wordLookupDFA_groups + ((groupCodeword * numCodes) % numGroups); 00501 00502 groupCodeword++; 00503 } 00504 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void wordLookupDFA_free | ( | ) |
Definition at line 917 of file wordLookupDFA.c.
References wordLookupDFA, wordLookupDFA_blockAddresses, and wordLookupDFA_groups.
Referenced by main().
00918 { 00919 free(wordLookupDFA_groups); 00920 free(wordLookupDFA); 00921 free(wordLookupDFA_blockAddresses); 00922 }
Here is the caller graph for this function:

| void wordLookupDFA_print | ( | ) |
Definition at line 798 of file wordLookupDFA.c.
References encoding_alphabetType, encoding_getLetter(), encoding_protein, int4, uint2, uint4, wordLookupDFA_additionalQueryPositions, wordLookupDFA_blockAddresses, wordLookupDFA_getCodeword(), wordLookupDFA_numBlocks, wordLookupDFA_numCodes, and wordLookupDFA_wordLength.
00799 { 00800 unsigned char *currentBlock; 00801 unsigned char codes[wordLookupDFA_wordLength + 1], code; 00802 unsigned char word; 00803 int4 groupCodeword; 00804 uint2* queryPositions; 00805 int4 totalQueryPositions = 0, totalEmptySlots = 0; 00806 int4 count; 00807 int4 numWords, numGroups; 00808 int4 numCodes, wordLength; 00809 uint4 codeCount; 00810 00811 numCodes = wordLookupDFA_numCodes; 00812 wordLength = wordLookupDFA_wordLength; 00813 00814 // Determine number of codewords and number of groups 00815 numGroups = ceil(pow(numCodes, wordLength - 1)); 00816 numWords = ceil(pow(numCodes, wordLength)); 00817 00818 // Initialize word to first position 00819 codeCount = 0; 00820 while (codeCount <= wordLength - 1) 00821 { 00822 codes[codeCount] = 0; 00823 codeCount++; 00824 } 00825 00826 // Iterate - For each possible group 00827 while (!codes[wordLength - 1]) 00828 { 00829 // Construct the codeword for array of codes 00830 groupCodeword = wordLookupDFA_getCodeword(codes, wordLength - 1); 00831 00832 // Get current block 00833 currentBlock = wordLookupDFA_blockAddresses[groupCodeword]; 00834 00835 // For each word in the group 00836 code = 0; 00837 while (code < numCodes) 00838 { 00839 // Get word in the block 00840 word = currentBlock[code]; 00841 00842 if (encoding_alphabetType == encoding_protein || word != 0) 00843 { 00844 // Print the word 00845 codeCount = 0; 00846 while (codeCount < wordLength - 1) 00847 { 00848 printf("%c", encoding_getLetter(codes[codeCount])); 00849 codeCount++; 00850 } 00851 printf("%c", encoding_getLetter(code)); 00852 00853 printf(" QueryPositions "); 00854 fflush(stdout); 00855 00856 printf("[%d]: ", word); 00857 00858 if (word == 0) 00859 { 00860 // No query positions 00861 totalEmptySlots++; 00862 } 00863 else 00864 { 00865 // At least one position at an external address 00866 queryPositions = ((uint2*)currentBlock) - word; 00867 00868 // If the zero flag is stored at the first query position 00869 if (!*queryPositions) 00870 { 00871 // Go to an outside address for additional positions 00872 queryPositions = wordLookupDFA_additionalQueryPositions 00873 + *(queryPositions + 1); 00874 } 00875 00876 count = 0; 00877 while (queryPositions[count] != 0) 00878 { 00879 printf("%d ", queryPositions[count] - 1); 00880 totalQueryPositions++; 00881 fflush(stdout); 00882 count++; 00883 } 00884 } 00885 00886 printf("\n"); 00887 } 00888 00889 code++; 00890 } 00891 00892 // Move to next word 00893 codes[0]++; 00894 codeCount = 0; 00895 while (codeCount < wordLength - 1) 00896 { 00897 if (codes[codeCount] >= numCodes) 00898 { 00899 codes[codeCount] = 0; 00900 codes[codeCount + 1]++; 00901 } 00902 else 00903 { 00904 break; 00905 } 00906 codeCount++; 00907 } 00908 } 00909 00910 printf("Total query positions=%d\n", totalQueryPositions); 00911 printf("Empty slots=%d/%d (%d%%)\n", totalEmptySlots, numWords, 00912 totalEmptySlots * 100 / numWords); 00913 printf("Number of blocks/groups=%d/%d\n", wordLookupDFA_numBlocks, numGroups); 00914 }
Here is the call graph for this function:

Definition at line 14 of file wordLookupDFA.c.
Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), and wordLookupDFA_print().
| struct group* wordLookupDFA_groups |
Definition at line 16 of file wordLookupDFA.c.
Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), and wordLookupDFA_free().
Definition at line 17 of file wordLookupDFA.c.
Referenced by wordLookupDFA_build(), wordLookupDFA_calcFrequencyGroups(), and wordLookupDFA_print().
Definition at line 17 of file wordLookupDFA.c.
Referenced by search_protein1hit(), search_protein2hit(), wordLookupDFA_build(), wordLookupDFA_findNeighbours(), wordLookupDFA_getCodes(), wordLookupDFA_getCodeword(), and wordLookupDFA_print().
Definition at line 17 of file wordLookupDFA.c.
Referenced by wordLookupDFA_build(), wordLookupDFA_findNeighbours(), wordLookupDFA_getNeighbours(), and wordLookupDFA_print().
1.5.2