#include "blast.h"#include <errno.h>#include <stdio.h>Include dependency graph for createindex.c:

Go to the source code of this file.
Functions | |
| int4 | main (int4 argc, char *argv[]) |
| int4 main | ( | int4 | argc, | |
| char * | argv[] | |||
| ) |
Definition at line 12 of file createindex.c.
References readFile::address, encoding_alphabetType, encoding_byteUnpack(), encoding_initialize(), encoding_nucleotide, readFile::fileSize, global_malloc(), index_addSubject(), index_finishBuild(), index_initializeBuild(), index_intervalSize, index_numWordOffsets(), index_wordOffsets(), index_wordSize, readFile_close(), readFile_open(), uint4, uint8, vbyte_get64vbyte(), vbyte_getVbyte, and vbyte_putVbyte.
00013 { 00014 unsigned char *filename, *address, *sequence, code, *startAddress, remainder; 00015 uint4 numberOfSequences, longestSequenceLength, dbAlphabetType; 00016 uint8 numberOfLetters; 00017 uint4 descriptionStart = 0, descriptionLength = 0, sequenceSize, count; 00018 uint4 fileSize, sequenceCount = 0, encodedLength; 00019 uint4 allocSize = 1024; 00020 uint4* int4Address; 00021 char* sequenceFilename, *indexFilename; 00022 struct readFile readFile; 00023 uint4 *sequencePositions, *descriptionLocations; 00024 uint4 descriptionFileSize = 0; 00025 00026 uint4 numWords, codeword, numOffsets, indexSize = 0; 00027 unsigned char* offsets, *vbyteEncoded, startVbyteEncoded[4]; 00028 FILE *indexFile; 00029 unsigned char headerData[40], *headerDataPointer; 00030 uint4 headerSize, positionBlockOffset = 0; 00031 uint4* listPositions, currentPosition = 0; 00032 uint4 fromCodeword, toCodeword, codewordRangeSize; 00033 00034 // User must provide FASTA format file at command line 00035 if (argc != 2 && argc != 4) 00036 { 00037 fprintf(stderr, "Useage: createindex <Database> [Word size] [Interval size]\n"); 00038 exit(-1); 00039 } 00040 filename = argv[1]; 00041 00042 // Optionally user provides word and interval size 00043 if (argc == 4) 00044 { 00045 index_wordSize = atoi(argv[2]); 00046 index_intervalSize = atoi(argv[3]); 00047 } 00048 00049 // Allocation initial buffer to hold sequences 00050 sequence = (char*)global_malloc(allocSize); 00051 00052 // Construct sequence filename 00053 sequenceFilename = (char*)global_malloc(strlen(filename) + 13); 00054 sprintf(sequenceFilename, "%s.sequences", filename); 00055 00056 // Open sequence file for reading, mapping contents to address 00057 readFile = readFile_open(sequenceFilename); 00058 startAddress = address = (char*)readFile.address; 00059 00060 // Get the file size 00061 fileSize = readFile.fileSize; 00062 00063 // Read database statistics (First 40 bytes contains 4 vbytes) 00064 vbyte_getVbyte(address, &numberOfSequences); 00065 address = vbyte_get64vbyte(address, &numberOfLetters); 00066 vbyte_getVbyte(address, &longestSequenceLength); 00067 vbyte_getVbyte(address, &dbAlphabetType); 00068 00069 // Advance to start of sequences 00070 address = startAddress + 40; 00071 00072 // Initialize list of sequence and description positions 00073 sequencePositions = (uint4*)malloc(sizeof(uint4) * numberOfSequences); 00074 descriptionLocations = (uint4*)malloc(sizeof(uint4) * numberOfSequences); 00075 00076 // Initialize codes array 00077 encoding_initialize(dbAlphabetType); 00078 00079 printf("Processing database..."); 00080 fflush(stdout); 00081 00082 // Move through sequence file reading sequence codes and converting back to ASCII sequences 00083 while (address < startAddress + fileSize) 00084 { 00085 // Record start position of sequence 00086 sequencePositions[sequenceCount] = address - startAddress; 00087 00088 // Read two vbytes, the description location in the FASTA file and the length of the sequence 00089 descriptionStart += descriptionLength; 00090 vbyte_getVbyte(address, &descriptionLength); 00091 vbyte_getVbyte(address, &sequenceSize); 00092 00093 // Record location of description 00094 descriptionLocations[sequenceCount] = descriptionStart; 00095 00096 // Nucleotide sequences 00097 if (encoding_alphabetType == encoding_nucleotide) 00098 { 00099 // Read a third vbyte; the total length of sequence data including wildcard info 00100 vbyte_getVbyte(address, &encodedLength); 00101 00102 // Skip to end of the sequence 00103 address += encodedLength; 00104 } 00105 // Protein sequences 00106 else 00107 { 00108 // Skip past sentinal byte 00109 address++; 00110 00111 sequence = address; 00112 address += sequenceSize; 00113 00114 // Skip past sentinal byte 00115 address++; 00116 } 00117 00118 sequenceCount++; 00119 00120 // Print status dots 00121 if (sequenceCount % 10000 == 0) 00122 { 00123 printf("."); 00124 fflush(stdout); 00125 } 00126 } 00127 00128 // Check we ended at the end of the file 00129 if (address != startAddress + fileSize) 00130 { 00131 fprintf(stderr, "Error: Premature end of sequence file\n"); 00132 exit(-1); 00133 } 00134 00135 printf("done.\n"); 00136 printf("%d sequences read.\n", sequenceCount); 00137 fflush(stdout); 00138 00139 // Create the index 00140 printf("Creating index..."); 00141 fflush(stdout); 00142 00143 indexFilename = (char*)global_malloc(strlen(filename) + 10); 00144 sprintf(indexFilename, "%s.index", filename); 00145 00146 // Open index file for writing 00147 if ((indexFile = fopen(indexFilename, "w")) == NULL) 00148 { 00149 fprintf(stderr, "Error opening file %s for writing\n", indexFilename); 00150 exit(-1); 00151 } 00152 00153 // Write word size and interval size at start of index 00154 headerDataPointer = headerData; 00155 vbyte_putVbyte(headerDataPointer, index_wordSize); 00156 vbyte_putVbyte(headerDataPointer, index_intervalSize); 00157 headerSize = headerDataPointer - headerData; 00158 if (fwrite(&headerData, sizeof(unsigned char), headerSize, indexFile) < headerSize) 00159 { 00160 fprintf(stderr, "Error writing header to index file %s\n", indexFilename); 00161 exit(-1); 00162 } 00163 00164 // Write sequence positions 00165 if (fwrite(sequencePositions, sizeof(uint4), numberOfSequences, indexFile) < numberOfSequences) 00166 { 00167 fprintf(stderr, "Error writing sequence positions to index file %s\n", indexFilename); 00168 fprintf(stderr, strerror(errno)); 00169 fprintf(stderr, "\n"); fflush(stderr); 00170 exit(-1); 00171 } 00172 00173 // Write description locations 00174 if (fwrite(descriptionLocations, sizeof(uint4), numberOfSequences, indexFile) < numberOfSequences) 00175 { 00176 fprintf(stderr, "Error writing description locations to index file %s\n", indexFilename); 00177 fprintf(stderr, strerror(errno)); 00178 fprintf(stderr, "\n"); fflush(stderr); 00179 exit(-1); 00180 } 00181 00182 // Write padding for list positions 00183 numWords = pow(4, index_wordSize); 00184 listPositions = (uint4*)global_malloc(sizeof(uint4) * (numWords + 1)); 00185 codeword = 0; 00186 while (codeword < numWords + 1) 00187 { 00188 listPositions[codeword] = 0; 00189 codeword++; 00190 } 00191 00192 if (fwrite(listPositions, sizeof(uint4), numWords + 1, indexFile) < numWords + 1) 00193 { 00194 fprintf(stderr, "Error writing word offset position padding to index file %s\n", indexFilename); 00195 fprintf(stderr, strerror(errno)); 00196 fprintf(stderr, "\n"); fflush(stderr); 00197 exit(-1); 00198 } 00199 00200 // SECOND PASS: build inverted index lists 00201 00202 // Determine the number of words to process each pass through the collection 00203 codewordRangeSize = (300000000 / (sizeof(struct wordList) + 00204 (6.0 * (float)numberOfLetters / (float)numWords / (float)index_intervalSize))); 00205 printf("codewordRangeSize=%d\n", codewordRangeSize); 00206 00207 // Pass through the collection considering words in the range between fromCodeword and toCodeword 00208 fromCodeword = 0; 00209 toCodeword = 0; 00210 while (toCodeword < numWords) 00211 { 00212 // Set fromCodeword and toCodeword 00213 fromCodeword = toCodeword; 00214 toCodeword += codewordRangeSize; 00215 if (toCodeword > numWords) 00216 toCodeword = numWords; 00217 00218 printf("[%d,%d]/%d\n", fromCodeword, toCodeword, numWords); 00219 00220 // Initialize index build 00221 index_initializeBuild(fromCodeword, toCodeword); 00222 00223 // Advance to start of sequences 00224 address = startAddress + 40; 00225 00226 // Move through sequence file reading sequences 00227 while (address < startAddress + fileSize) 00228 { 00229 // Read two vbytes, the description location in the FASTA file and the length of the sequence 00230 descriptionStart += descriptionLength; 00231 vbyte_getVbyte(address, &descriptionLength); 00232 vbyte_getVbyte(address, &sequenceSize); 00233 00234 // Nucleotide sequences 00235 if (encoding_alphabetType == encoding_nucleotide) 00236 { 00237 // Read a third vbyte; the total length of sequence data including wildcard info 00238 vbyte_getVbyte(address, &encodedLength); 00239 00240 // Unpack the sequence 00241 sequence = encoding_byteUnpack(address, sequenceSize); 00242 00243 // Skip to end of the sequence 00244 address += encodedLength; 00245 } 00246 // Protein sequences 00247 else 00248 { 00249 // Skip past sentinal byte 00250 address++; 00251 00252 sequence = address; 00253 address += sequenceSize; 00254 00255 // Skip past sentinal byte 00256 address++; 00257 } 00258 00259 // Add the subject to the index structure 00260 index_addSubject(sequence, sequenceSize, fromCodeword, toCodeword); 00261 00262 if (encoding_alphabetType == encoding_nucleotide) 00263 free(sequence); 00264 00265 sequenceCount++; 00266 00267 // Print status dots 00268 if (sequenceCount % 10000 == 0) 00269 { 00270 printf("."); 00271 fflush(stdout); 00272 } 00273 } 00274 00275 // Check we ended at the end of the file 00276 if (address != startAddress + fileSize) 00277 { 00278 fprintf(stderr, "Error: Premature end of sequence file\n"); 00279 exit(-1); 00280 } 00281 00282 // Get word list positions and number of words 00283 // wordOffsetPositions = index_wordOffsetPositions(fromCodeword, toCodeword); 00284 // numWords = toCodeword - fromCodeword; 00285 00286 // For each word, write list of offset-gaps to END of file 00287 codeword = fromCodeword; 00288 while (codeword < toCodeword) 00289 { 00290 // Write the list of offsets 00291 numOffsets = index_numWordOffsets(codeword); 00292 offsets = index_wordOffsets(codeword); 00293 00294 // Record position of offsets 00295 listPositions[codeword] = currentPosition; 00296 currentPosition += numOffsets; 00297 00298 if (numOffsets > 0) 00299 { 00300 /* int a = 0, b = 0; 00301 while (a < numOffsets) 00302 { 00303 b += offsets[a]; 00304 a++; 00305 } 00306 fprintf(stderr, "[%p][%d][%p][%d]\n", offsets, numOffsets, indexFile, b); fflush(stderr); 00307 */ 00308 // if (fwrite("abc", sizeof(unsigned char), 3, indexFile) < 3) 00309 if (fwrite(offsets, sizeof(unsigned char), numOffsets, indexFile) < numOffsets) 00310 { 00311 fprintf(stderr, "Error writing word offsets to index file %s\n", indexFilename); 00312 fprintf(stderr, strerror(errno)); 00313 fprintf(stderr, "\n"); fflush(stderr); 00314 exit(-1); 00315 } 00316 } 00317 00318 indexSize += numOffsets; 00319 codeword++; 00320 } 00321 00322 // Finishing building inverted lists for the given range of words 00323 index_finishBuild(fromCodeword, toCodeword); 00324 } 00325 00326 readFile_close(readFile); 00327 00328 // Record extra list position 00329 listPositions[toCodeword] = currentPosition; 00330 00331 // Close writing to index file 00332 fclose(indexFile); 00333 00334 // Reopen and write list positions 00335 if ((indexFile = fopen(indexFilename, "r+")) == NULL) 00336 { 00337 fprintf(stderr, "Error opening file %s for writing\n", indexFilename); 00338 exit(-1); 00339 } 00340 00341 // Jump to position of interest 00342 fseek(indexFile, numberOfSequences * sizeof(uint4) * 2 + headerSize, SEEK_SET); 00343 00344 // Write list positions to index file 00345 if (fwrite(listPositions, sizeof(uint4), numWords + 1, indexFile) < numWords + 1) 00346 { 00347 fprintf(stderr, "Error writing word offset positions to index file %s\n", indexFilename); 00348 fprintf(stderr, strerror(errno)); 00349 fprintf(stderr, "\n"); fflush(stderr); 00350 exit(-1); 00351 } 00352 00353 // Close writing to index file 00354 fclose(indexFile); 00355 00356 printf("done.\n"); 00357 00358 printf("Size of index = %dMb + %dMb + %dMb + %dMb = ", numberOfSequences * 4 / 1048576, 00359 numberOfSequences * 4 / 1048576, ((numWords + 1) * 4) / 1048576, 00360 indexSize / 1048576); 00361 00362 indexSize += (numWords + 1) * 4 + numberOfSequences * 4 + numberOfSequences * 4; 00363 printf("%dMb\n", indexSize / 1048576); 00364 00365 return 0; 00366 }
Here is the call graph for this function:

1.5.2