#include <blast.h>

Include dependency graph for formatdb.c:

Go to the source code of this file.
Functions | |
| uint4 | determineDbAlphabetType (char *filename) |
| int4 | main (int argc, char *argv[]) |
uint4 determineDbAlphabetType (char *filename)
Definition at line 128 of file formatdb.c.
References encoding_determineAlphabetType(), encoding_nucleotide, encoding_protein, int4, readFasta_close(), readFasta_open(), readFasta_readSequence(), readFasta_sequenceBuffer, readFasta_sequenceLength, and uint4.
Referenced by main().
00129 { 00130 int4 sequenceCount = 0; 00131 char* sequence; 00132 uint4 sequenceLength; 00133 00134 // Open FASTA file for reading 00135 readFasta_open(filename); 00136 00137 // Move through the FASTA file reading descriptions and sequences 00138 while (readFasta_readSequence() && sequenceCount < 10) 00139 { 00140 // Get sequence just read 00141 sequence = readFasta_sequenceBuffer; 00142 sequenceLength = readFasta_sequenceLength; 00143 00144 // Determine the alphabet of the current sequence 00145 if (encoding_determineAlphabetType(sequence, sequenceLength) == encoding_protein) 00146 { 00147 // If contains protein letters, return protein type 00148 readFasta_close(); 00149 return encoding_protein; 00150 } 00151 00152 sequenceCount++; 00153 } 00154 00155 // Close fasta reader and return nucleotide type 00156 readFasta_close(); 00157 return encoding_nucleotide; 00158 }
Here is the call graph for this function:

Here is the caller graph for this function:

int4 main (int argc, char *argv[])
Definition at line 12 of file formatdb.c.
References determineDbAlphabetType(), encoding_encodeSequence(), encoding_initialize(), encoding_nucleotide, encoding_protein, encoding_replaceWildcards(), global_realloc(), int4, memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, readFasta_close(), readFasta_descriptionBuffer, readFasta_descriptionLength, readFasta_open(), readFasta_readSequence(), readFasta_sequenceBuffer, readFasta_sequenceLength, uint4, vbyte_putVbyte, writedb_addSequence(), writedb_close(), writedb_initialize(), writedb_maximumSequenceLength, writedb_minimumSequenceLength, writedb_numberOfLetters, writedb_sequenceCount, and writedb_volume.
00013 { 00014 char *sequence, *filename; 00015 uint4 sequenceLength; 00016 int4 totalWilds = 0, alphabetType; 00017 struct memSingleBlock* wildcardEdits; 00018 struct wildcardEdit* wildcardEdit; 00019 char *wildcardData = NULL, *startWildcardData = NULL; 00020 00021 // User must provide FASTA format file at command line 00022 if (argc < 2) 00023 { 00024 fprintf(stderr, "Useage: formatdb <FASTA file>\n"); 00025 exit(-1); 00026 } 00027 filename = argv[1]; 00028 00029 // Initialize array to store wildcard edits 00030 wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10); 00031 00032 // Determine if database is protein or nucleotide 00033 alphabetType = determineDbAlphabetType(filename); 00034 00035 if (alphabetType == encoding_protein) 00036 { 00037 printf("PROTEIN database detected.\n"); 00038 } 00039 else if (alphabetType == encoding_nucleotide) 00040 { 00041 printf("NUCLEOTIDE database detected.\n"); 00042 } 00043 00044 // Initialize codes array 00045 encoding_initialize(alphabetType); 00046 00047 // Initialize writing to formatted database 00048 writedb_initialize(filename, alphabetType); 00049 00050 // Open FASTA file for reading 00051 readFasta_open(filename); 00052 00053 printf("Formatting database..."); 00054 fflush(stdout); 00055 00056 // Move through the FASTA file reading descriptions and sequences 00057 while (readFasta_readSequence()) 00058 { 00059 // Get sequence just read 00060 sequence = readFasta_sequenceBuffer; 00061 sequenceLength = readFasta_sequenceLength; 00062 00063 // Encode the sequence 00064 encoding_encodeSequence(sequence, sequenceLength, alphabetType); 00065 00066 // Convert nucleotide sequences to byte-packed format 00067 if (alphabetType == encoding_nucleotide) 00068 { 00069 // Replace any wilds with a random character 00070 totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength); 00071 00072 // Declare memory to hold wildcard data 00073 startWildcardData = global_realloc(startWildcardData, 
00074 sizeof(char) * wildcardEdits->numEntries * 5); 00075 wildcardData = startWildcardData; 00076 00077 // For each wildcard edit, encode details using chars and vbytes 00078 memSingleBlock_resetCurrent(wildcardEdits); 00079 while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL) 00080 { 00081 // Record wild character 00082 *wildcardData = wildcardEdit->code; 00083 wildcardData++; 00084 00085 // Convert the position to a vbyte 00086 vbyte_putVbyte(wildcardData, wildcardEdit->position); 00087 } 00088 } 00089 else 00090 { 00091 startWildcardData = wildcardData = NULL; 00092 } 00093 00094 // printf("[%s](%d)", readFasta_descriptionBuffer, readFasta_descriptionLength); fflush(stdout); 00095 00096 // Add sequence to the formatted collection 00097 writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer, 00098 readFasta_descriptionLength, startWildcardData, 00099 wildcardData - startWildcardData, NULL, 0); 00100 00101 // Print status dots 00102 if (writedb_sequenceCount % 10000 == 0) 00103 { 00104 printf("."); 00105 fflush(stdout); 00106 } 00107 } 00108 00109 // Close fasta reader 00110 readFasta_close(); 00111 00112 // Finalize writing to the formatted collection 00113 writedb_close(); 00114 00115 printf("done.\n"); 00116 printf("%d sequences processed.\n", writedb_sequenceCount); 00117 printf("%llu letters processed.\n", writedb_numberOfLetters); 00118 printf("%d wildcards encoded.\n", totalWilds); 00119 printf("%d volume(s) created.\n", writedb_volume + 1); 00120 printf("Longest/shortest sequence was %d/%d letters\n", 00121 writedb_maximumSequenceLength, writedb_minimumSequenceLength); 00122 fflush(stdout); 00123 00124 return 0; 00125 }
Here is the call graph for this function:

1.5.2