include/index.h File Reference

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  indexCoordinate
struct  wordList
struct  queryWord

Functions

void index_initializeBuild (uint4 fromCodeword, uint4 toCodeword)
void index_addSubject (unsigned char *subject, uint4 subjectLength, uint4 fromCodeword, uint4 toCodeword)
void index_finishBuild (uint4 fromCodeword, uint4 toCodeword)
uint4 * index_wordOffsetPositions ()
uint4 index_numWordOffsets (uint4 codeword)
unsigned char * index_wordOffsets (uint4 codeword)
void index_processQuery (unsigned char *startIndex, struct PSSMatrix PSSMatrix, uint4 collectionSize)
indexCoordinate * index_getFirstCoordinate ()
indexCoordinate * index_getNextCoordinate ()
uint4 index_generateCodeword (unsigned char *word, uint4 wordSize)
void index_print ()

Variables

uint4 index_wordSize
uint4 index_intervalSize
indexCoordinate ** index_sequenceCoordinates
indexCoordinate * index_coordinates
uint4 index_numCoordinates
uint4 * index_sequencePositions
uint4 * index_descriptionLocations


Function Documentation

void index_addSubject ( unsigned char *  subject,
uint4  subjectLength,
uint4  fromCodeword,
uint4  toCodeword 
)

Definition at line 77 of file index.c.

References index_addWord(), index_generateCodeword(), index_intervalSize, index_subjectNumber, index_wordSize, and uint4.

Referenced by main().

00078 {
00079         uint4 codeword, wordOffset;
00080 
00081     // Add subjectNumber and offset to lists for each word
00082         wordOffset = 0;
00083     while (wordOffset + index_wordSize - 1 < subjectLength)
00084     {
00085         // For this word
00086         codeword = index_generateCodeword(subject + wordOffset, index_wordSize);
00087 
00088         // If it is in the range of codewords we are considering
00089         if (codeword >= fromCodeword && codeword < toCodeword)
00090         {
00091             // Record subject number and offset of word in index
00092             index_addWord(codeword, index_subjectNumber, wordOffset / index_intervalSize);// + (index_wordSize - 4));
00093                 }
00094 
00095         // Index every Nth word
00096         wordOffset += index_intervalSize;
00097     }
00098 
00099     index_subjectNumber++;
00100 }

Here is the call graph for this function:

Here is the caller graph for this function:

void index_finishBuild ( uint4  fromCodeword,
uint4  toCodeword 
)

Definition at line 58 of file index.c.

References index_words, wordList::offsets, and uint4.

Referenced by main().

00059 {
00060         uint4 codeword;
00061 
00062     // For each word
00063     codeword = fromCodeword;
00064     while (codeword < toCodeword)
00065     {
00066         // Free list
00067         free(index_words[codeword].offsets);
00068         codeword++;
00069     }
00070 
00071     // Free the lists
00072         index_words += fromCodeword;
00073     free(index_words);
00074 }

Here is the caller graph for this function:

uint4 index_generateCodeword ( unsigned char *  word,
uint4  wordSize 
)

struct indexCoordinate* index_getFirstCoordinate (  )  [read]

Definition at line 376 of file index.c.

References index_currentCoordinate, and index_getNextCoordinate().

00377 {
00378     // Reset counters
00379         index_currentCoordinate = 0;
00380 
00381     // Get coordinate
00382     return index_getNextCoordinate();
00383 }

Here is the call graph for this function:

struct indexCoordinate* index_getNextCoordinate (  )  [read]

Definition at line 386 of file index.c.

References index_coordinates, index_currentCoordinate, and index_numCoordinates.

Referenced by index_getFirstCoordinate().

00387 {
00388         struct indexCoordinate* coordinate;
00389 
00390     if (index_currentCoordinate >= index_numCoordinates)
00391         return NULL;
00392 
00393     // Get current coordinate and return it
00394         coordinate = index_coordinates + index_currentCoordinate;
00395         index_currentCoordinate++;
00396 
00397     return coordinate;
00398 }

Here is the caller graph for this function:

void index_initializeBuild ( uint4  fromCodeword,
uint4  toCodeword 
)

Definition at line 32 of file index.c.

References wordList::allocated, global_malloc(), index_subjectNumber, index_words, wordList::lastOffset, wordList::lastSequenceNumber, wordList::length, wordList::offsets, and uint4.

Referenced by main().

00033 {
00034         uint4 codeword;
00035 
00036 //      index_numWords = pow(4, index_wordSize);
00037     index_words = (struct wordList*)global_malloc(sizeof(struct wordList) * (toCodeword - fromCodeword));
00038         index_words -= fromCodeword;
00039 
00040     // For each word
00041     codeword = fromCodeword;
00042     while (codeword < toCodeword)
00043     {
00044         // Initialize list of occurrences
00045         index_words[codeword].offsets = NULL;
00046                 index_words[codeword].length = 0;
00047         index_words[codeword].allocated = 0;
00048         index_words[codeword].lastOffset = 0;
00049         index_words[codeword].lastSequenceNumber = 0;
00050 
00051         codeword++;
00052     }
00053 
00054     index_subjectNumber = 0;
00055 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 index_numWordOffsets ( uint4  codeword  ) 

Definition at line 144 of file index.c.

References index_words, and wordList::length.

Referenced by main().

00145 {
00146         return index_words[codeword].length;
00147 }

Here is the caller graph for this function:

void index_print (  ) 

Definition at line 462 of file index.c.

References index_numWords, index_words, wordList::length, wordList::offsets, uint4, and vbyte_getVbyte.

00463 {
00464         uint4 codeword = 0;
00465         struct wordList* wordList;
00466     unsigned char* offsets, *endOffsets;
00467     uint4 offsetGap, offset, numOffsets;
00468         uint4 totalSize = 0;
00469 
00470     while (codeword < index_numWords)
00471     {
00472         numOffsets = 0; offset = 0;
00473 
00474         wordList = index_words + codeword;
00475 
00476                 offsets = wordList->offsets;
00477         endOffsets = offsets + wordList->length;
00478 
00479         totalSize += wordList->length;
00480 
00481         if (offsets < endOffsets)
00482                 printf("\nCodeword=%u:", codeword);
00483 
00484         while (offsets < endOffsets)
00485         {
00486                         vbyte_getVbyte(offsets, (&offsetGap));
00487             offset += offsetGap;
00488             printf(" %u", offset);
00489             numOffsets++;
00490         }
00491 
00492 //      printf("[%d/%d = %f]\n", wordList->length, numOffsets, (float)(wordList->length) / (float)numOffsets);
00493 
00494         codeword++;
00495     }
00496 
00497     printf("\nTotal table size=%d bytes\n", totalSize);
00498 }

void index_processQuery ( unsigned char *  startIndex,
struct PSSMatrix  PSSMatrix,
uint4  collectionSize 
)

Definition at line 161 of file index.c.

References alignments_compareCodeword(), alignments_compareQueryPosition(), PSSMatrix::bestMatchCodes, blast_numHits, queryWord::codeword, encoding_numRegularLetters, queryWord::endOffsets, global_malloc(), index_coordinates, index_descriptionLocations, index_generateCodeword(), index_intervalSize, index_loadedWords, index_numCoordinates, index_numWords, index_offsets, index_sequenceCoordinates, index_sequencePositions, index_wordSize, PSSMatrix::length, memBlocks_free(), memBlocks_getCurrent(), memBlocks_initialize(), memBlocks_newEntry(), memBlocks_resetCurrent(), memBlocks::numTotalEntries, queryWord::offsets, wordList::offsets, PSSMatrix::queryCodes, indexCoordinate::queryOffset, queryWord::queryPosition, PSSMatrix::strandLength, indexCoordinate::subjectNumber, indexCoordinate::subjectOffset, uint4, and vbyte_getVbyte.

00163 {
00164         uint4 queryPosition, codeword = 0, queryPosition4;
00165     unsigned char* offsets, *endOffsets;
00166     uint4 offsetGap, offset, sequenceGap, sequenceNumber;
00167     struct indexCoordinate* coordinate;
00168         struct memBlocks* unsortedCoordinates;
00169     uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets;
00170     uint4 time, wordPosition, containsWildcard;
00171         struct queryWord* queryWords;
00172 
00173     // Read word and interval size from start of index
00174         vbyte_getVbyte(startIndex, &index_wordSize);
00175         vbyte_getVbyte(startIndex, &index_intervalSize);
00176 
00177         index_numWords = pow(4, index_wordSize);
00178     index_sequencePositions = (uint4*)startIndex;
00179     index_descriptionLocations = index_sequencePositions + numSequences;
00180         index_loadedWords = index_descriptionLocations + numSequences;
00181         index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1);
00182 
00183     time = clock();
00184     unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences);
00185 
00186     // Declare and initialize array for count number of hits for each sequence
00187     numSubjectHits = (uint*)global_malloc(sizeof(uint4) * numSequences);
00188         sequenceNumber = 0;
00189     while (sequenceNumber < numSequences)
00190     {
00191         numSubjectHits[sequenceNumber] = 0;
00192         sequenceNumber++;
00193     }
00194 
00195     // Memory to hold offsets string for each query word
00196     numQueryPositions = PSSMatrix.length - index_wordSize + 1;
00197         queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions);
00198 
00199     // For each word in the query
00200     queryPosition = 0;
00201     while (queryPosition < numQueryPositions)
00202     {
00203         // Check if the word contains a wildcard
00204         containsWildcard = 0; wordPosition = 0;
00205         while (wordPosition < index_wordSize)
00206         {
00207             if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters)
00208                 containsWildcard = 1;
00209 
00210             wordPosition++;
00211         }
00212 
00213         // Don't include words that cross the strand boundry or contain wildcards
00214         if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength &&
00215               queryPosition >= PSSMatrix.strandLength - index_wordSize + 1))
00216                 {
00217 //            printf("--Query position=%d\n", queryPosition);
00218 
00219             // Get the codeword
00220             codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize);
00221 
00222             // Get wordlist for that codeword
00223             offsets = index_offsets + index_loadedWords[codeword];
00224             endOffsets = index_offsets + index_loadedWords[codeword + 1];
00225 
00226             queryWords[queryPosition].offsets = offsets;
00227                         queryWords[queryPosition].endOffsets = endOffsets;
00228                         queryWords[queryPosition].queryPosition = queryPosition;
00229             queryWords[queryPosition].codeword = codeword;
00230 
00231 //            printf("codeword=%d start=%d end=%d numHits=%d\n", codeword, index_loadedWords[codeword],
00232 //                   index_loadedWords[codeword + 1], endOffsets - offsets);
00233                 }
00234         else
00235         {
00236             queryWords[queryPosition].offsets = NULL;
00237                         queryWords[queryPosition].endOffsets = NULL;
00238                         queryWords[queryPosition].queryPosition = queryPosition;
00239             queryWords[queryPosition].codeword = codeword;
00240         }
00241 
00242 //        printf("\n");
00243         queryPosition++;
00244     }
00245 
00246     // Sort the query words by codeword
00247         qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword);
00248 
00249     // For each query word
00250     queryWordCount = 0;
00251     while (queryWordCount < numQueryPositions)
00252     {
00253         // Ignoring those that cross the strand boundry
00254                 if (queryWords[queryWordCount].offsets != NULL)
00255         {
00256                 // Make in-memory copy of list of offsets
00257             numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets;
00258                         offsets = (char*)global_malloc(sizeof(char) * numOffsets);
00259 
00260             memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets);
00261                         queryWords[queryWordCount].offsets = offsets;
00262             queryWords[queryWordCount].endOffsets = offsets + numOffsets;
00263                 }
00264 
00265         queryWordCount++;
00266     }
00267 
00268     // Sort the query words by query position
00269         qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition);
00270 
00271     queryPosition = 0;
00272     while (queryPosition < numQueryPositions)
00273     {
00274         // Ignoring those that cross the strand boundry
00275                 if (queryWords[queryPosition].offsets != NULL)
00276         {
00277                 offsets = queryWords[queryPosition].offsets;
00278             endOffsets = queryWords[queryPosition].endOffsets;
00279             offset = 0;
00280             sequenceNumber = 0;
00281                 queryPosition4 = queryPosition + (index_wordSize - 4);
00282 
00283             // Traverse the offsets
00284             while (offsets < endOffsets)
00285             {
00286                 vbyte_getVbyte(offsets, (&sequenceGap));
00287                 vbyte_getVbyte(offsets, (&offsetGap));
00288 
00289 //                printf("[%d,%d]\n", sequenceGap, offsetGap);
00290 
00291                 if (sequenceGap > 0)
00292                 {
00293                         offset = offsetGap;
00294                     sequenceNumber += sequenceGap;
00295                 }
00296                 else
00297                 {
00298                         offset += offsetGap;
00299                 }
00300     //            printf(" %u", offset);
00301 
00302                 // Add query/database coordinate of match to relevant bucket
00303 //                printf("Sequence number=%d\n", sequenceNumber);
00304                 coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates);
00305                 coordinate->queryOffset = queryPosition4;
00306                 coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4);
00307                 coordinate->subjectNumber = sequenceNumber;
00308 
00309                 numSubjectHits[sequenceNumber]++;
00310 //                printf("[%d,%d]\n", queryPosition, offset);
00311 
00312                 blast_numHits++;
00313             }
00314 
00315             free(queryWords[queryPosition].offsets);
00316                 }
00317 
00318         queryPosition++;
00319         }
00320 
00321 
00322     printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC);
00323     time = clock();
00324 
00325     // Make memory for sorted list
00326     index_numCoordinates = unsortedCoordinates->numTotalEntries;
00327         index_coordinates = (struct indexCoordinate*)global_malloc(
00328                          sizeof(struct indexCoordinate) * index_numCoordinates);
00329         index_sequenceCoordinates = (struct indexCoordinate**)global_malloc(
00330                                  sizeof(struct indexCoordinate*) * numSequences);
00331 
00332     // For each sequence
00333         coordinate = index_coordinates;
00334     sequenceNumber = 0;
00335     while (sequenceNumber < numSequences)
00336     {
00337         // If it has hits
00338         if (numSubjectHits[sequenceNumber] != 0)
00339         {
00340                 // Point to location in sorted list of coordinates
00341                         index_sequenceCoordinates[sequenceNumber] = coordinate;
00342             coordinate += numSubjectHits[sequenceNumber];
00343 
00344             numSubjectHits[sequenceNumber] = 0;
00345         }
00346         sequenceNumber++;
00347     }
00348 
00349     // Move through list of unsorted coordinates
00350     memBlocks_resetCurrent(unsortedCoordinates);
00351     while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL)
00352     {
00353         sequenceNumber = coordinate->subjectNumber;
00354 //      printf("%d,%d=[%d]\n", index_sequenceCoordinates[sequenceNumber], numSubjectHits[sequenceNumber], sequenceNumber);
00355         // Place into sorted list
00356                 index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate;
00357                 numSubjectHits[sequenceNumber]++;
00358     }
00359 
00360     memBlocks_free(unsortedCoordinates);
00361 
00362 /*    // Print sorted coordinates
00363         coordinate = index_coordinates;
00364     while (coordinate < index_coordinates + index_numCoordinates)
00365     {
00366         printf("[%d]", coordinate);
00367         printf("Subject %d Offset %d,%d\n", coordinate->subjectNumber, coordinate->queryOffset,
00368                                             coordinate->subjectOffset);
00369         coordinate++;
00370     }*/
00371 
00372     printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC);
00373 }

Here is the call graph for this function:

uint4* index_wordOffsetPositions (  ) 

unsigned char* index_wordOffsets ( uint4  codeword  ) 

Definition at line 150 of file index.c.

References index_words, and wordList::offsets.

Referenced by main().

00151 {
00152         return index_words[codeword].offsets;
00153 }

Here is the caller graph for this function:


Variable Documentation

struct indexCoordinate* index_coordinates

Definition at line 39 of file index.h.

Referenced by index_getNextCoordinate(), and index_processQuery().

uint4* index_descriptionLocations

Definition at line 43 of file index.h.

Referenced by index_processQuery().

uint4 index_intervalSize

Definition at line 19 of file index.c.

Referenced by index_addSubject(), index_processQuery(), and main().

uint4 index_numCoordinates

Definition at line 40 of file index.h.

Referenced by index_getNextCoordinate(), and index_processQuery().

struct indexCoordinate** index_sequenceCoordinates

Definition at line 38 of file index.h.

Referenced by index_processQuery().

uint4* index_sequencePositions

Definition at line 42 of file index.h.

Referenced by index_processQuery().

uint4 index_wordSize

Definition at line 19 of file index.c.

Referenced by index_addSubject(), index_processQuery(), and main().


Generated on Wed Dec 19 20:49:43 2007 for fsa-blast by  doxygen 1.5.2