src/createindex.c File Reference

#include "blast.h"
#include <errno.h>
#include <stdio.h>

Include dependency graph for createindex.c:

Go to the source code of this file.

Functions

int4 main (int4 argc, char *argv[])


Function Documentation

int4 main ( int4  argc,
char *  argv[] 
)

Definition at line 12 of file createindex.c.

References readFile::address, encoding_alphabetType, encoding_byteUnpack(), encoding_initialize(), encoding_nucleotide, readFile::fileSize, global_malloc(), index_addSubject(), index_finishBuild(), index_initializeBuild(), index_intervalSize, index_numWordOffsets(), index_wordOffsets(), index_wordSize, readFile_close(), readFile_open(), uint4, uint8, vbyte_get64vbyte(), vbyte_getVbyte, and vbyte_putVbyte.

00013 {
00014         unsigned char *filename, *address, *sequence, code, *startAddress, remainder;
00015         uint4 numberOfSequences, longestSequenceLength, dbAlphabetType;
00016     uint8 numberOfLetters;
00017         uint4 descriptionStart = 0, descriptionLength = 0, sequenceSize, count;
00018         uint4 fileSize, sequenceCount = 0, encodedLength;
00019         uint4 allocSize = 1024;
00020         uint4* int4Address;
00021         char* sequenceFilename, *indexFilename;
00022         struct readFile readFile;
00023     uint4 *sequencePositions, *descriptionLocations;
00024     uint4 descriptionFileSize = 0;
00025 
00026     uint4 numWords, codeword, numOffsets, indexSize = 0;
00027         unsigned char* offsets, *vbyteEncoded, startVbyteEncoded[4];
00028         FILE *indexFile;
00029         unsigned char headerData[40], *headerDataPointer;
00030     uint4 headerSize, positionBlockOffset = 0;
00031         uint4* listPositions, currentPosition = 0;
00032         uint4 fromCodeword, toCodeword, codewordRangeSize;
00033 
00034     // User must provide FASTA format file at command line
00035         if (argc != 2 && argc != 4)
00036         {
00037                 fprintf(stderr, "Useage: createindex <Database> [Word size] [Interval size]\n");
00038                 exit(-1);
00039         }
00040         filename = argv[1];
00041 
00042     // Optionally user provides word and interval size
00043     if (argc == 4)
00044     {
00045         index_wordSize = atoi(argv[2]);
00046         index_intervalSize = atoi(argv[3]);
00047     }
00048 
00049         // Allocation initial buffer to hold sequences
00050         sequence = (char*)global_malloc(allocSize);
00051 
00052         // Construct sequence filename
00053         sequenceFilename = (char*)global_malloc(strlen(filename) + 13);
00054         sprintf(sequenceFilename, "%s.sequences", filename);
00055 
00056         // Open sequence file for reading, mapping contents to address
00057         readFile = readFile_open(sequenceFilename);
00058         startAddress = address = (char*)readFile.address;
00059 
00060     // Get the file size
00061         fileSize = readFile.fileSize;
00062 
00063         // Read database statistics (First 40 bytes contains 4 vbytes)
00064     vbyte_getVbyte(address, &numberOfSequences);
00065     address = vbyte_get64vbyte(address, &numberOfLetters);
00066     vbyte_getVbyte(address, &longestSequenceLength);
00067     vbyte_getVbyte(address, &dbAlphabetType);
00068 
00069     // Advance to start of sequences
00070     address = startAddress + 40;
00071 
00072     // Initialize list of sequence and description positions
00073     sequencePositions = (uint4*)malloc(sizeof(uint4) * numberOfSequences);
00074     descriptionLocations = (uint4*)malloc(sizeof(uint4) * numberOfSequences);
00075 
00076     // Initialize codes array
00077         encoding_initialize(dbAlphabetType);
00078 
00079     printf("Processing database...");
00080         fflush(stdout);
00081 
00082     // Move through sequence file reading sequence codes and converting back to ASCII sequences
00083         while (address < startAddress + fileSize)
00084         {
00085         // Record start position of sequence
00086                 sequencePositions[sequenceCount] = address - startAddress;
00087 
00088         // Read two vbytes, the description location in the FASTA file and the length of the sequence
00089         descriptionStart += descriptionLength;
00090         vbyte_getVbyte(address, &descriptionLength);
00091         vbyte_getVbyte(address, &sequenceSize);
00092 
00093         // Record location of description
00094         descriptionLocations[sequenceCount] = descriptionStart;
00095 
00096         // Nucleotide sequences
00097         if (encoding_alphabetType == encoding_nucleotide)
00098         {
00099             // Read a third vbyte; the total length of sequence data including wildcard info
00100             vbyte_getVbyte(address, &encodedLength);
00101 
00102             // Skip to end of the sequence
00103             address += encodedLength;
00104         }
00105         // Protein sequences
00106         else
00107         {
00108             // Skip past sentinal byte
00109             address++;
00110 
00111             sequence = address;
00112             address += sequenceSize;
00113 
00114             // Skip past sentinal byte
00115             address++;
00116                 }
00117 
00118                 sequenceCount++;
00119 
00120         // Print status dots
00121                 if (sequenceCount % 10000 == 0)
00122                 {
00123                         printf(".");
00124                         fflush(stdout);
00125                 }
00126         }
00127 
00128         // Check we ended at the end of the file
00129         if (address != startAddress + fileSize)
00130         {
00131                 fprintf(stderr, "Error: Premature end of sequence file\n");
00132                 exit(-1);
00133         }
00134 
00135         printf("done.\n");
00136         printf("%d sequences read.\n", sequenceCount);
00137         fflush(stdout);
00138 
00139     // Create the index
00140         printf("Creating index...");
00141         fflush(stdout);
00142 
00143     indexFilename = (char*)global_malloc(strlen(filename) + 10);
00144         sprintf(indexFilename, "%s.index", filename);
00145 
00146         // Open index file for writing
00147         if ((indexFile = fopen(indexFilename, "w")) == NULL)
00148         {
00149                 fprintf(stderr, "Error opening file %s for writing\n", indexFilename);
00150                 exit(-1);
00151         }
00152 
00153     // Write word size and interval size at start of index
00154     headerDataPointer = headerData;
00155     vbyte_putVbyte(headerDataPointer, index_wordSize);
00156     vbyte_putVbyte(headerDataPointer, index_intervalSize);
00157     headerSize = headerDataPointer - headerData;
00158     if (fwrite(&headerData, sizeof(unsigned char), headerSize, indexFile) < headerSize)
00159         {
00160                 fprintf(stderr, "Error writing header to index file %s\n", indexFilename);
00161                 exit(-1);
00162         }
00163 
00164     // Write sequence positions
00165     if (fwrite(sequencePositions, sizeof(uint4), numberOfSequences, indexFile) < numberOfSequences)
00166     {
00167         fprintf(stderr, "Error writing sequence positions to index file %s\n", indexFilename);
00168         fprintf(stderr, strerror(errno));
00169         fprintf(stderr, "\n"); fflush(stderr);
00170         exit(-1);
00171     }
00172 
00173     // Write description locations
00174     if (fwrite(descriptionLocations, sizeof(uint4), numberOfSequences, indexFile) < numberOfSequences)
00175     {
00176         fprintf(stderr, "Error writing description locations to index file %s\n", indexFilename);
00177         fprintf(stderr, strerror(errno));
00178         fprintf(stderr, "\n"); fflush(stderr);
00179         exit(-1);
00180     }
00181 
00182     // Write padding for list positions
00183     numWords = pow(4, index_wordSize);
00184         listPositions = (uint4*)global_malloc(sizeof(uint4) * (numWords + 1));
00185         codeword = 0;
00186     while (codeword < numWords + 1)
00187     {
00188                 listPositions[codeword] = 0;
00189         codeword++;
00190         }
00191 
00192     if (fwrite(listPositions, sizeof(uint4), numWords + 1, indexFile) < numWords + 1)
00193     {
00194         fprintf(stderr, "Error writing word offset position padding to index file %s\n", indexFilename);
00195         fprintf(stderr, strerror(errno));
00196         fprintf(stderr, "\n"); fflush(stderr);
00197         exit(-1);
00198     }
00199 
00200     // SECOND PASS: build inverted index lists
00201 
00202     // Determine the number of words to process each pass through the collection
00203     codewordRangeSize = (300000000 / (sizeof(struct wordList) +
00204                         (6.0 * (float)numberOfLetters / (float)numWords / (float)index_intervalSize)));
00205         printf("codewordRangeSize=%d\n", codewordRangeSize);
00206 
00207         // Pass through the collection considering words in the range between fromCodeword and toCodeword
00208     fromCodeword = 0;
00209     toCodeword = 0;
00210     while (toCodeword < numWords)
00211     {
00212         // Set fromCodeword and toCodeword
00213                 fromCodeword = toCodeword;
00214         toCodeword += codewordRangeSize;
00215         if (toCodeword > numWords)
00216                 toCodeword = numWords;
00217 
00218                 printf("[%d,%d]/%d\n", fromCodeword, toCodeword, numWords);
00219 
00220         // Initialize index build
00221         index_initializeBuild(fromCodeword, toCodeword);
00222 
00223         // Advance to start of sequences
00224         address = startAddress + 40;
00225 
00226         // Move through sequence file reading sequences
00227         while (address < startAddress + fileSize)
00228         {
00229             // Read two vbytes, the description location in the FASTA file and the length of the sequence
00230             descriptionStart += descriptionLength;
00231             vbyte_getVbyte(address, &descriptionLength);
00232             vbyte_getVbyte(address, &sequenceSize);
00233 
00234             // Nucleotide sequences
00235             if (encoding_alphabetType == encoding_nucleotide)
00236             {
00237                 // Read a third vbyte; the total length of sequence data including wildcard info
00238                 vbyte_getVbyte(address, &encodedLength);
00239 
00240                 // Unpack the sequence
00241                 sequence = encoding_byteUnpack(address, sequenceSize);
00242 
00243                 // Skip to end of the sequence
00244                 address += encodedLength;
00245             }
00246             // Protein sequences
00247             else
00248             {
00249                 // Skip past sentinal byte
00250                 address++;
00251 
00252                 sequence = address;
00253                 address += sequenceSize;
00254 
00255                 // Skip past sentinal byte
00256                 address++;
00257             }
00258 
00259             // Add the subject to the index structure
00260             index_addSubject(sequence, sequenceSize, fromCodeword, toCodeword);
00261 
00262             if (encoding_alphabetType == encoding_nucleotide)
00263                 free(sequence);
00264 
00265             sequenceCount++;
00266 
00267             // Print status dots
00268             if (sequenceCount % 10000 == 0)
00269             {
00270                 printf(".");
00271                 fflush(stdout);
00272             }
00273         }
00274 
00275         // Check we ended at the end of the file
00276         if (address != startAddress + fileSize)
00277         {
00278             fprintf(stderr, "Error: Premature end of sequence file\n");
00279             exit(-1);
00280         }
00281 
00282         // Get word list positions and number of words
00283 //        wordOffsetPositions = index_wordOffsetPositions(fromCodeword, toCodeword);
00284 //        numWords = toCodeword - fromCodeword;
00285 
00286         // For each word, write list of offset-gaps to END of file
00287         codeword = fromCodeword;
00288         while (codeword < toCodeword)
00289         {
00290             // Write the list of offsets
00291             numOffsets = index_numWordOffsets(codeword);
00292             offsets = index_wordOffsets(codeword);
00293 
00294             // Record position of offsets
00295                         listPositions[codeword] = currentPosition;
00296             currentPosition += numOffsets;
00297 
00298             if (numOffsets > 0)
00299             {
00300 /*                int a = 0, b = 0;
00301                 while (a < numOffsets)
00302                 {
00303                     b += offsets[a];
00304                     a++;
00305                 }
00306                 fprintf(stderr, "[%p][%d][%p][%d]\n", offsets, numOffsets, indexFile, b); fflush(stderr);
00307 */
00308 //                if (fwrite("abc", sizeof(unsigned char), 3, indexFile) < 3)
00309                 if (fwrite(offsets, sizeof(unsigned char), numOffsets, indexFile) < numOffsets)
00310                 {
00311                     fprintf(stderr, "Error writing word offsets to index file %s\n", indexFilename);
00312                     fprintf(stderr, strerror(errno));
00313                     fprintf(stderr, "\n"); fflush(stderr);
00314                     exit(-1);
00315                 }
00316                         }
00317 
00318             indexSize += numOffsets;
00319             codeword++;
00320         }
00321 
00322         // Finishing building inverted lists for the given range of words
00323         index_finishBuild(fromCodeword, toCodeword);
00324     }
00325 
00326     readFile_close(readFile);
00327 
00328     // Record extra list position
00329     listPositions[toCodeword] = currentPosition;
00330 
00331     // Close writing to index file
00332     fclose(indexFile);
00333 
00334     // Reopen and write list positions
00335         if ((indexFile = fopen(indexFilename, "r+")) == NULL)
00336         {
00337                 fprintf(stderr, "Error opening file %s for writing\n", indexFilename);
00338                 exit(-1);
00339         }
00340 
00341     // Jump to position of interest
00342     fseek(indexFile, numberOfSequences * sizeof(uint4) * 2 + headerSize, SEEK_SET);
00343 
00344     // Write list positions to index file
00345     if (fwrite(listPositions, sizeof(uint4), numWords + 1, indexFile) < numWords + 1)
00346     {
00347         fprintf(stderr, "Error writing word offset positions to index file %s\n", indexFilename);
00348         fprintf(stderr, strerror(errno));
00349         fprintf(stderr, "\n"); fflush(stderr);
00350         exit(-1);
00351     }
00352 
00353         // Close writing to index file
00354         fclose(indexFile);
00355 
00356     printf("done.\n");
00357 
00358     printf("Size of index = %dMb + %dMb + %dMb + %dMb = ", numberOfSequences * 4 / 1048576,
00359            numberOfSequences * 4 / 1048576, ((numWords + 1) * 4) / 1048576,
00360            indexSize / 1048576);
00361 
00362     indexSize += (numWords + 1) * 4 + numberOfSequences * 4 + numberOfSequences * 4;
00363     printf("%dMb\n", indexSize / 1048576);
00364 
00365         return 0;
00366 }

Here is the call graph for this function:


Generated on Wed Dec 19 20:54:50 2007 for fsa-blast by  doxygen 1.5.2