src/formatdb.c File Reference

#include <blast.h>

Include dependency graph for formatdb.c:

Go to the source code of this file.

Functions

uint4 determineDbAlphabetType (char *filename)
int4 main (int argc, char *argv[])


Function Documentation

uint4 determineDbAlphabetType ( char *  filename  ) 

Definition at line 128 of file formatdb.c.

References encoding_determineAlphabetType(), encoding_nucleotide, encoding_protein, int4, readFasta_close(), readFasta_open(), readFasta_readSequence(), readFasta_sequenceBuffer, readFasta_sequenceLength, and uint4.

Referenced by main().

00129 {   // Classifies the FASTA file `filename` as protein or nucleotide from its leading sequences
00130         int4 sequenceCount = 0;   // number of sequences examined so far
00131     char* sequence;
00132     uint4 sequenceLength;
00133 
00134     // Open FASTA file for reading
00135         readFasta_open(filename);
00136 
00137         // Read sequences one at a time; stop when readFasta_readSequence() returns zero or once 10 have been examined
00138         while (readFasta_readSequence() && sequenceCount < 10)   // the read is evaluated before the count check
00139         {
00140                 // Get sequence just read (the reader exposes it through readFasta_sequenceBuffer / readFasta_sequenceLength)
00141                 sequence = readFasta_sequenceBuffer;
00142                 sequenceLength = readFasta_sequenceLength;
00143 
00144         // Determine the alphabet of the current sequence
00145         if (encoding_determineAlphabetType(sequence, sequenceLength) == encoding_protein)
00146                 {
00147                 // A sequence classified as protein decides the result: close the reader and return immediately
00148             readFasta_close();
00149                         return encoding_protein;
00150         }
00151 
00152         sequenceCount++;
00153         }
00154 
00155         // No examined sequence was classified as protein: close the reader and return nucleotide type
00156         readFasta_close();
00157     return encoding_nucleotide;
00158 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 main ( int  argc,
char *  argv[] 
)

Definition at line 12 of file formatdb.c.

References determineDbAlphabetType(), encoding_encodeSequence(), encoding_initialize(), encoding_nucleotide, encoding_protein, encoding_replaceWildcards(), global_realloc(), int4, memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, readFasta_close(), readFasta_descriptionBuffer, readFasta_descriptionLength, readFasta_open(), readFasta_readSequence(), readFasta_sequenceBuffer, readFasta_sequenceLength, uint4, vbyte_putVbyte, writedb_addSequence(), writedb_close(), writedb_initialize(), writedb_maximumSequenceLength, writedb_minimumSequenceLength, writedb_numberOfLetters, writedb_sequenceCount, and writedb_volume.

00013 {   // Entry point: detects the alphabet of the FASTA file in argv[1], then encodes and writes every sequence via writedb_*
00014         char *sequence, *filename;
00015     uint4 sequenceLength;
00016     int4 totalWilds = 0, alphabetType;   // totalWilds accumulates the return values of encoding_replaceWildcards()
00017     struct memSingleBlock* wildcardEdits;
00018     struct wildcardEdit* wildcardEdit;
00019     char *wildcardData = NULL, *startWildcardData = NULL;   // write cursor into / start of the per-sequence wildcard buffer
00020 
00021         // User must provide FASTA format file at command line; otherwise print usage to stderr and exit with -1
00022         if (argc < 2)
00023         {
00024                 fprintf(stderr, "Useage: formatdb <FASTA file>\n");
00025                 exit(-1);
00026         }
00027         filename = argv[1];
00028 
00029     // Initialize array to store wildcard edits (elements are struct wildcardEdit; the 10 is presumably an initial capacity - confirm)
00030     wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10);
00031 
00032     // Determine if database is protein or nucleotide (this opens, reads and closes the FASTA file once)
00033         alphabetType = determineDbAlphabetType(filename);
00034 
00035     if (alphabetType == encoding_protein)
00036     {
00037         printf("PROTEIN database detected.\n");
00038     }
00039     else if (alphabetType == encoding_nucleotide)
00040     {
00041         printf("NUCLEOTIDE database detected.\n");
00042     }
00043 
00044         // Initialize codes array for the detected alphabet
00045         encoding_initialize(alphabetType);
00046 
00047         // Initialize writing to formatted database
00048     writedb_initialize(filename, alphabetType);
00049 
00050     // Open FASTA file for reading (second pass over the same file, this time processing every sequence)
00051         readFasta_open(filename);
00052 
00053         printf("Formatting database...");
00054         fflush(stdout);   // the message has no trailing newline, so flush explicitly
00055 
00056         // Move through the FASTA file reading descriptions and sequences
00057         while (readFasta_readSequence())
00058         {
00059                 // Get sequence just read
00060                 sequence = readFasta_sequenceBuffer;
00061                 sequenceLength = readFasta_sequenceLength;
00062 
00063         // Encode the sequence (called on the reader's buffer; presumably encodes in place - confirm)
00064                 encoding_encodeSequence(sequence, sequenceLength, alphabetType);
00065 
00066         // Nucleotide sequences additionally get their wildcards replaced and recorded as wildcard data
00067         if (alphabetType == encoding_nucleotide)
00068         {
00069                         // Replace any wilds with a random character; the edits are recorded in wildcardEdits
00070             totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength);
00071 
00072                         // Resize the wildcard buffer to 5 bytes per recorded edit. NOTE(review): each edit writes 1 code byte plus a vbyte position - confirm 5 bytes per edit is always enough
00073             startWildcardData = global_realloc(startWildcardData,
00074                                 sizeof(char) * wildcardEdits->numEntries * 5);
00075             wildcardData = startWildcardData;
00076 
00077             // For each wildcard edit, encode details using chars and vbytes
00078             memSingleBlock_resetCurrent(wildcardEdits);
00079             while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL)
00080             {
00081                 // Record wild character
00082                 *wildcardData = wildcardEdit->code;
00083                 wildcardData++;
00084 
00085                 // Convert the position to a vbyte. NOTE(review): wildcardData is not advanced here otherwise; presumably vbyte_putVbyte is a macro that advances it - confirm
00086                 vbyte_putVbyte(wildcardData, wildcardEdit->position);
00087             }
00088                 }
00089         else
00090         {
00091                 startWildcardData = wildcardData = NULL;   // non-nucleotide path: no wildcard data for this sequence
00092                 }
00093 
00094 //        printf("[%s](%d)", readFasta_descriptionBuffer, readFasta_descriptionLength); fflush(stdout);
00095 
00096         // Add sequence to the formatted collection; the wildcard data length is the distance the cursor moved from the buffer start
00097         writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer,
00098                             readFasta_descriptionLength, startWildcardData,
00099                             wildcardData - startWildcardData, NULL, 0);
00100 
00101                 // Print a status dot whenever writedb_sequenceCount is a multiple of 10000
00102                 if (writedb_sequenceCount % 10000 == 0)
00103                 {
00104                         printf(".");
00105                         fflush(stdout);
00106                 }
00107         }
00108 
00109         // Close fasta reader
00110         readFasta_close();
00111 
00112     // Finalize writing to the formatted collection
00113     writedb_close();
00114 
00115         printf("done.\n");   // summary statistics follow. NOTE(review): confirm the format specifiers below match the declared types of the writedb_* counters
00116         printf("%d sequences processed.\n", writedb_sequenceCount);
00117         printf("%llu letters processed.\n", writedb_numberOfLetters);
00118     printf("%d wildcards encoded.\n", totalWilds);
00119         printf("%d volume(s) created.\n", writedb_volume + 1);   // writedb_volume + 1 is reported as the volume count
00120         printf("Longest/shortest sequence was %d/%d letters\n",
00121            writedb_maximumSequenceLength, writedb_minimumSequenceLength);
00122         fflush(stdout);
00123 
00124         return 0;   // NOTE(review): no release of startWildcardData or wildcardEdits is visible in this listing before returning
00125 }

Here is the call graph for this function:


Generated on Wed Dec 19 20:55:50 2007 for fsa-blast by  doxygen 1.5.2