src/chooseWilds.c File Reference

#include "blast.h"

Include dependency graph for chooseWilds.c:

Go to the source code of this file.

Functions

void chooseWilds_printOccurenceMatrix (struct wild *wilds, uint4 numWilds)
int4 main (int4 argc, char *argv[])


Function Documentation

void chooseWilds_printOccurenceMatrix ( struct wild wilds,
uint4  numWilds 
)

Definition at line 212 of file chooseWilds.c.

References wild::code, wild::count, encoding_getLetter(), encoding_numLetters, getbit, and uint4.

Referenced by main().

00213 {
00214         uint4 count, aa1, aa2, code;
00215         uint4 counts[encoding_numLetters][encoding_numLetters];
00216 
00217     aa1 = 0;
00218     while (aa1 < encoding_numLetters)
00219     {
00220         aa2 = 0;
00221         while (aa2 < encoding_numLetters)
00222         {
00223             counts[aa1][aa2] = 0;
00224             aa2++;
00225         }
00226         aa1++;
00227     }
00228 
00229     count = 0;
00230     while (count < numWilds)
00231     {
00232                 code = wilds[count].code;
00233 
00234         aa1 = 0;
00235         while (aa1 < encoding_numLetters)
00236         {
00237             aa2 = 0;
00238             while (aa2 < encoding_numLetters)
00239             {
00240                 if (getbit(code, aa1) && getbit(code, aa2))
00241                 {
00242                                         counts[aa1][aa2] += wilds[count].count;
00243                 }
00244                 aa2++;
00245             }
00246                 aa1++;
00247         }
00248 
00249 //        printf("%d: ", wilds[count].count);
00250 //      wildcards_printWildcard(wilds[count].code);
00251         count++;
00252     }
00253 
00254     aa1 = 0;
00255     while (aa1 < encoding_numLetters)
00256     {
00257         printf("%c ", encoding_getLetter(aa1));
00258 
00259         aa2 = 0;
00260         while (aa2 < aa1)
00261         {
00262             printf("%6d ", counts[aa1][aa2]);
00263             aa2++;
00264         }
00265         printf("\n");
00266         aa1++;
00267     }
00268 
00269     printf("  ");
00270     aa2 = 0;
00271     while (aa2 < encoding_numLetters)
00272     {
00273         printf("     %c ", encoding_getLetter(aa2));
00274         aa2++;
00275     }
00276     printf("\n");
00277 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 main ( int4  argc,
char *  argv[] 
)

Definition at line 13 of file chooseWilds.c.

References chooseWilds_printOccurenceMatrix(), wild::code, wild::count, encoding_alphabetType, encoding_alphabetTypes, encoding_initialize(), encoding_numLetters, encoding_protein, global_malloc(), parameters_findScoringMatrix(), parameters_scoringMatrixPath, readdb_dbAlphabetType, readdb_getChildren(), readdb_longestSequenceLength, readdb_nextVolume(), readdb_numberOfClusters, readdb_numberOfLetters, readdb_numberOfSequences, readdb_numberOfVolumes, readdb_open(), readdb_readSequence(), scoreMatrix_load(), setbit, uint4, wildcards_averageResidueWildMatch(), wildcards_countOccurences(), wildcards_getOccurences(), wildcards_getSubset(), wildcards_initializeCountOccurences(), wildcards_numClusterWildcards, wildcards_outputWildcards(), wildcards_printWildcard(), wildcards_scoreCandidates(), wildcards_scoreMatrix, and wildcards_scoringConstant.

00014 {
00015         unsigned char *filename, *readdb_address, *sequence, code, *wildcardsFilename;
00016         uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
00017         uint4 encodedLength, numChildren, count;
00018         char *description;
00019     struct child* children, *child;
00020     uint4 candidateNum, change, childNum;
00021     uint4 numWilds = 0;
00022         struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
00023         struct wild *wildSubset, *newCandidates, *bestNewCandidates;
00024     uint4 sizeWildSubset, numOccurences, numCandidates;
00025         float defaultWildscore, candidatesScore, bestScore;
00026 
00027         // User must provide FASTA format file at command line
00028         if (argc < 4)
00029         {
00030                 fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> <Wildcards output file>\n");
00031                 exit(-1);
00032         }
00033         filename = argv[1];
00034     wildcards_scoringConstant = atof(argv[2]);
00035     wildcardsFilename = argv[3];
00036 
00037         readdb_open(filename);
00038 
00039     printf("Number of clusters = %u\n", readdb_numberOfClusters);
00040     printf("Number of sequences = %u\n", readdb_numberOfSequences);
00041     printf("Number of volumes = %u\n", readdb_numberOfVolumes);
00042         printf("Total number of letters = %llu\n", readdb_numberOfLetters);
00043         printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
00044         printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);
00045 
00046         // Initialize codes array
00047         encoding_initialize(readdb_dbAlphabetType);
00048 
00049     // Load score matrix
00050     parameters_findScoringMatrix();
00051     wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);
00052 
00053     // Count occurences of each wildcard set
00054     wildcards_initializeCountOccurences(readdb_longestSequenceLength);
00055     do
00056     {
00057         // Read each sequence in the collection
00058         while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
00059                                    &descriptionLength, &encodedLength))
00060         {
00061                 // If a protein sequence cluster
00062             if (encoding_alphabetType == encoding_protein && sequenceLength + 2 != encodedLength)
00063             {
00064                 // Get the children
00065                 children = readdb_getChildren(sequence, sequenceLength, encodedLength,
00066                                               descriptionStart, &numChildren);
00067 
00068                                 // Add to list of occurences
00069                 wildcards_countOccurences(children, numChildren, sequenceLength);
00070 
00071                 childNum = 0;
00072                 while (childNum < numChildren)
00073                 {
00074                     free(children[childNum].edits);
00075                     free(children[childNum].sequence - 1);
00076                     childNum++;
00077                 }
00078 
00079                 free(children);
00080             }
00081         }
00082         }
00083     while (readdb_nextVolume());
00084 
00085     // Get final list of number of occurences of each wild
00086     wilds = wildcards_getOccurences(&numWilds);
00087 
00088     chooseWilds_printOccurenceMatrix(wilds, numWilds);
00089 
00090     // Build default wildcard
00091         defaultWild.code = 0;
00092     defaultWild.count = 0;
00093     code = 0;
00094     while (code < encoding_numLetters)
00095     {
00096         setbit(defaultWild.code, code);
00097         code++;
00098     }
00099 
00100     // Get average score for default wildcard
00101     wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds, &sizeWildSubset, &numOccurences);
00102     defaultWildscore = wildcards_averageResidueWildMatch(defaultWild, wildSubset, sizeWildSubset);
00103         printf("defaultWildScore=%f occurences=%d\n", defaultWildscore, numOccurences);
00104 
00105     // Build up list of wildcard candidates
00106         candidates = (struct wild*)global_malloc(sizeof(struct wild) * wildcards_numClusterWildcards);
00107     numCandidates = 0;
00108     while (numCandidates < wildcards_numClusterWildcards - 1)
00109     {
00110         // Explore each possible option to add to list of candidates
00111         count = 0;
00112         bestScore = 0;
00113         while (count < numWilds)
00114         {
00115 //              printf("set pos %d to ", numCandidates);
00116 //                      wildcards_printWildcard(wilds[count].code);
00117                         candidates[numCandidates] = wilds[count];
00118 
00119             // Score a set of candidates
00120             candidatesScore = wildcards_scoreCandidates(candidates, numCandidates + 1,
00121                                                         wilds, numWilds, defaultWildscore);
00122 //            printf("Candidates saving=%f\n", candidatesScore);
00123                         if (candidatesScore > bestScore)
00124             {
00125                 bestScore = candidatesScore;
00126                 bestNewCandidate = wilds[count];
00127             }
00128 
00129             count++;
00130         }
00131 
00132         printf("Score=%f Best new candidate (%d): ", bestScore, numCandidates);
00133                 wildcards_printWildcard(bestNewCandidate.code);
00134                 candidates[numCandidates] = bestNewCandidate;
00135 
00136                 numCandidates++;
00137     }
00138 
00139     newCandidates = (struct wild*)global_malloc(sizeof(struct wild) * wildcards_numClusterWildcards);
00140     bestNewCandidates = (struct wild*)global_malloc(sizeof(struct wild) * wildcards_numClusterWildcards);
00141 
00142     // Perform hill climbing; consider changing each position
00143     change = 1;
00144     while (change)
00145     {
00146         change = 0;
00147                 candidateNum = 0;
00148         bestScore = 0;
00149         while (candidateNum < numCandidates)
00150         {
00151                 // Start with current candidates
00152                         memcpy(newCandidates, candidates, sizeof(struct wild) * wildcards_numClusterWildcards - 1);
00153 
00154             // Change current position to every possible candidate
00155             count = 0;
00156             while (count < numWilds)
00157             {
00158                 newCandidates[candidateNum] = wilds[count];
00159 
00160                 // Score a possible new set of candidates
00161                 candidatesScore = wildcards_scoreCandidates(newCandidates, numCandidates,
00162                                                             wilds, numWilds, defaultWildscore);
00163 
00164                                 // Check if best new candidates
00165                 if (candidatesScore > bestScore)
00166                 {
00167                     bestScore = candidatesScore;
00168                     memcpy(bestNewCandidates, newCandidates, sizeof(struct wild) *
00169                            wildcards_numClusterWildcards - 1);
00170                 }
00171 
00172                 count++;
00173             }
00174 
00175                 candidateNum++;
00176         }
00177 
00178         // Update candidates
00179         if (bestScore > wildcards_scoreCandidates(candidates, numCandidates,
00180                                                   wilds, numWilds, defaultWildscore))
00181                 {
00182                 printf("New bestScore=%f\n", bestScore);
00183                 memcpy(candidates, bestNewCandidates, sizeof(struct wild) *
00184                    wildcards_numClusterWildcards - 1);
00185                         change = 1;
00186         }
00187 
00188                 candidateNum = 0;
00189         while (candidateNum < numCandidates)
00190         {
00191                         wildcards_printWildcard(candidates[candidateNum].code);
00192                 candidateNum++;
00193                 }
00194     }
00195 
00196     // Print out final set of clusters with default wild added
00197         candidates[numCandidates] = defaultWild; numCandidates++;
00198         wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds, defaultWildscore);
00199         wildcards_outputWildcards(wildcardsFilename);
00200 
00201     printf("%d sequences read.\n", readdb_numberOfSequences);
00202         fflush(stdout);
00203 
00204     free(candidates);
00205     free(newCandidates);
00206     free(bestNewCandidates);
00207 
00208         return 0;
00209 }

Here is the call graph for this function:


Generated on Wed Dec 19 20:54:15 2007 for fsa-blast by  doxygen 1.5.2