src/rsdb.c File Reference

#include "blast.h"
#include "vbyte2.h"
#include "readindex.h"
#include "vec.h"
#include "vbyte2.c"
#include "readindex.c"
#include "vec.c"
#include "identityAlign.c"

Include dependency graph for rsdb.c:

Go to the source code of this file.

Data Structures

struct  longList
struct  sequenceLists
struct  diagonal
struct  sequenceMatch
struct  sequence
struct  parent

Defines

#define numHashtablePasses   50
#define wildcardSumWindow   100
#define percentMatchesRequired   1
#define rsdb_threshold   90

Functions

int4 rsdb_compareDescriptionLocations (const void *sequence1, const void *sequence2)
int4 rsdb_compareSequenceLengths (const void *sequence1, const void *sequence2)
void rsdb_printSequence (uint4 whitespace, unsigned char *sequence, uint4 length)
void rsdb_processSequenceMatches (struct memSingleBlock *sequenceMatches, struct sequence *sequences)
void rsdb_processList (struct memSingleBlock *diagonals, struct wordLocation *wordLocations, uint4 numWordLocations)
void rsdb_printCluster (struct parent *parent)
uint4 rsdb_score (struct sequence *seq1, struct sequence *seq2, int4 relativeOffset, float wildscoreAllowed)
void rsdb_newParent (struct parent *newParent, struct sequence *child)
void rsdb_addChild (struct parent *parent, struct sequence *child, int4 relativeOffset)
wordLocationrsdb_mergeLists (struct wordLocation *wordLocations1, uint4 *numWordLocations1, struct wordLocation *wordLocations2, uint4 *numWordLocations2, int4 mergeRelativeOffset, uint *lengthNewList)
void rsdb_printList (struct wordLocation *list, uint4 length)
int4 rsdb_calculateSavings ()
void rsdb_writeClusters (char *filename, struct sequence *sequences)
void rsdb_freePSSM ()
void rsdb_spexClusterSequences (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters)
void rsdb_updateDiagonalMatches (struct memSingleBlock *diagonals, struct wordLocation location1, struct wordLocation location2)
int4 main (int4 argc, char *argv[])
int4 rsdb_compareListLengths (const void *list1, const void *list2)
Hashcounterrsdb_slottedSpex (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters)
Hashcounterrsdb_winnowingSpex (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters)

Variables

uint4 decoWordLength = 30
uint4 numIterations = 3
uint4 decoQantum = 9
uint4 maxListLength = 20
float rsdb_averageMatchScore = 0
float * rsdb_averageWildcodeScores = NULL
float wildcardsThreshold = 0.25
uint4 rsdb_numMergeDiagonals
memBlocksrsdb_parents
uint4 rsdb_numParents = 0
uint4 ** rsdb_PSSM
int4 rsdb_PSSMstart
int4 rsdb_PSSMend
int4 rsdb_PSSMlength
uint4 rsdb_accumulatorsSize = 0


Define Documentation

#define numHashtablePasses   50

Definition at line 23 of file rsdb.c.

Referenced by cluster_spexClusterSequences(), and rsdb_spexClusterSequences().

#define percentMatchesRequired   1

Definition at line 25 of file rsdb.c.

#define rsdb_threshold   90

Definition at line 26 of file rsdb.c.

Referenced by rsdb_score().

#define wildcardSumWindow   100

Definition at line 24 of file rsdb.c.


Function Documentation

int4 main ( int4  argc,
char *  argv[] 
)

Definition at line 123 of file rsdb.c.

References scoreMatrix::averageMatchScore, decoQantum, decoWordLength, sequence::descriptionLength, sequence::descriptionLocation, encoding_alphabetTypes, encoding_free(), encoding_initialize(), encoding_protein, global_free(), global_malloc(), identityAlign_free(), sequence::length, maxListLength, memBlocks_free(), memBlocks_initialize(), sequence::number, numIterations, parameters_findScoringMatrix(), parameters_free(), parameters_scoringMatrixPath, sequence::parent, readdb_close(), readdb_dbAlphabetType, readdb_longestSequenceLength, readdb_numberOfClusters, readdb_numberOfLetters, readdb_numberOfSequences, readdb_open(), readdb_readSequence(), rsdb_averageMatchScore, rsdb_averageWildcodeScores, rsdb_calculateSavings(), rsdb_compareSequenceLengths(), rsdb_parents, rsdb_spexClusterSequences(), rsdb_writeClusters(), scoreMatrix_free(), scoreMatrix_load(), sequence::sequence, wordLocation::sequenceNumber, uint4, wildcards_initialize(), wildcards_readWildcards(), and wildcards_scoreMatrix.

00124 {
00125         unsigned char *filename, *sequence;
00126         uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
00127         uint4 sequenceNumber = 0, encodedLength;
00128         struct sequence* sequences;
00129 
00130         // User must provide FASTA format file at command line
00131         if (argc != 2 && argc != 6)
00132         {
00133                 fprintf(stderr, "Useage: rsdb <database> [decoWordLength] [numIterations] [decoQantum] [maxListLength]\n");
00134                 exit(-1);
00135         }
00136         filename = argv[1];
00137 
00138     if (argc == 6)
00139     {
00140         decoWordLength = atoi(argv[2]);
00141         numIterations = atoi(argv[3]);
00142         decoQantum = atoi(argv[4]);
00143         maxListLength = atoi(argv[5]);
00144         }
00145 
00146     // Open formatted file for reading
00147     readdb_open(filename);
00148 
00149         printf("Number of sequences = %u\n", readdb_numberOfSequences);
00150         printf("Total number of letters = %llu\n", readdb_numberOfLetters);
00151         printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
00152         printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);
00153 
00154     if (readdb_dbAlphabetType != encoding_protein)
00155     {
00156         fprintf(stderr, "Error: Only protein collection clustering currently supported\n");
00157                 exit(-1);
00158     }
00159 
00160     if (readdb_numberOfSequences != readdb_numberOfClusters)
00161     {
00162         fprintf(stderr, "Error clustering %s: collection has already been clustered\n", filename);
00163         exit(-1);
00164     }
00165 
00166     sequences = (struct sequence*)global_malloc(sizeof(struct sequence) * readdb_numberOfSequences);
00167 
00168         // Initialize codes array
00169         encoding_initialize(readdb_dbAlphabetType);
00170 
00171     // Initialize cluster wildcards
00172         wildcards_initialize();
00173 
00174     if (argc == 5)
00175     {
00176         // Read in a set of wildcards
00177         wildcards_readWildcards(argv[4]);
00178     }
00179 
00180     // Load score matrix
00181     parameters_findScoringMatrix();
00182     wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);
00183 
00184     // Calculate average score for matching two residues
00185     rsdb_averageMatchScore = wildcards_scoreMatrix.averageMatchScore;
00186 
00187     // Move through index file reading sequence codes and converting back to ASCII sequences
00188     sequenceNumber = 0;
00189         while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
00190                                &descriptionLength, &encodedLength))
00191         {
00192         // Record sequence, sequence length, number, no parent
00193         sequences[sequenceNumber].sequence = sequence;
00194         sequences[sequenceNumber].length = sequenceLength;
00195         sequences[sequenceNumber].number = sequenceNumber;
00196         sequences[sequenceNumber].parent = NULL;
00197         sequences[sequenceNumber].descriptionLength = descriptionLength;
00198         sequences[sequenceNumber].descriptionLocation = descriptionStart;
00199 
00200         sequenceNumber++;
00201     }
00202 
00203     if (sequenceNumber != readdb_numberOfSequences)
00204     {
00205         fprintf(stderr, "Error: only %d sequences read\n", sequenceNumber);
00206         exit(-1);
00207     }
00208 
00209     printf("%d sequences read.\n", sequenceNumber);
00210         fflush(stdout);
00211 
00212     // Sort sequences by length
00213     qsort(sequences, readdb_numberOfSequences, sizeof(struct sequence),
00214           rsdb_compareSequenceLengths);
00215 
00216     // Update sequence numbers
00217     sequenceNumber = 0;
00218         while (sequenceNumber < readdb_numberOfSequences)
00219         {
00220         sequences[sequenceNumber].number = sequenceNumber; 
00221         sequenceNumber++;
00222         }
00223 
00224     // Initialize array for storing parents
00225         rsdb_parents = memBlocks_initialize(sizeof(struct parent), 10000);
00226 
00227     // Simple approach to clustering sequences
00228 //    rsdb_simpleClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters);
00229 
00230         // Perform SPEX algorithm and cluster sequences
00231     rsdb_spexClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters);
00232 
00233     // Cluster sequences in the collection
00234 //    rsdb_clusterSequences(sequences, sequenceNumber, readdb_numberOfLetters, filename);
00235 
00236     printf("Total bytes saved=%d\n", rsdb_calculateSavings()); fflush(stdout);
00237 
00238 //    printf("END Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout);
00239 
00240     // Load sequence descriptions from disk
00241 //    rsdb_loadDescriptions(sequences);
00242 
00243         // Write clusters to disk
00244     rsdb_writeClusters(filename, sequences);
00245 
00246     // Close reading formatted database
00247     readdb_close();
00248 
00249     free(rsdb_averageWildcodeScores);
00250     free(sequences);
00251     memBlocks_free(rsdb_parents);
00252     encoding_free();
00253     global_free();
00254     scoreMatrix_free(wildcards_scoreMatrix);
00255     parameters_free();
00256     identityAlign_free();
00257     return 0;
00258 }

Here is the call graph for this function:

void rsdb_addChild ( struct parent parent,
struct sequence child,
int4  relativeOffset 
)

Definition at line 1221 of file rsdb.c.

References parent::children, global_realloc(), parent::length, sequence::length, parent::numChildren, sequence::parent, and sequence::regionStart.

Referenced by rsdb_spexClusterSequences().

01222 {
01223     // If child is longer than parent
01224     if (child->length > parent->length)
01225         return;
01226 
01227     // Add new child
01228     child->regionStart = 0;
01229 //    child->length = parent->length;
01230     child->parent = parent;
01231 //    child->sequence = parent->sequence;
01232     parent->numChildren++;
01233         parent->children = (struct sequence**)global_realloc(parent->children,
01234                        sizeof(struct sequence*) * parent->numChildren);
01235         parent->children[parent->numChildren - 1] = child;
01236 
01237 //      printf("Added %d: \n", child->number);
01238 //    rsdb_printCluster(parent);
01239 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 rsdb_calculateSavings (  ) 

Definition at line 1120 of file rsdb.c.

References parent::children, int4, parent::length, sequence::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), sequence::number, parent::numChildren, sequence::parent, and rsdb_parents.

Referenced by main().

01121 {
01122         int4 count, childNum, numCharsSaved, bytesSaved = 0, numWilds;
01123     struct sequence* child;
01124         struct parent* parent;
01125 
01126     // For each parent
01127     memBlocks_resetCurrent(rsdb_parents);
01128         while ((parent = memBlocks_getCurrent(rsdb_parents)) != NULL)
01129     {
01130         numWilds = 0;
01131 
01132         // If it has children
01133         if (parent->children != NULL)
01134         {
01135             // For each child calculate number of characters saved
01136             numCharsSaved = 0;
01137             childNum = 0;
01138             while (childNum < parent->numChildren)
01139             {
01140                 child = parent->children[childNum];
01141 
01142                 // CHECK: no child has two parents
01143                 if (child->parent != parent)
01144                 {
01145                         printf("Error sequence %d has multiple parents\n", child->number);
01146                     exit(-1);
01147                 }
01148 
01149                 numCharsSaved += child->length;
01150                 childNum++;
01151             }
01152 
01153             // Minus cost of parent
01154             bytesSaved += numCharsSaved - parent->length;
01155 
01156             if (numCharsSaved < parent->length)
01157             {
01158                                 printf("Error parent length %d with %d children saves negative bytes",
01159                         parent->length, parent->numChildren);
01160                 exit(-1);
01161             }
01162         }
01163     }
01164 
01165         return bytesSaved;
01166 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 rsdb_compareDescriptionLocations ( const void *  sequence1,
const void *  sequence2 
)

Definition at line 352 of file rsdb.c.

References sequence::descriptionLocation.

Referenced by rsdb_writeClusters().

00353 {
00354         const struct sequence *a1, *a2;
00355 
00356         a1 = (struct sequence*)sequence1;
00357         a2 = (struct sequence*)sequence2;
00358 
00359         if (a1->descriptionLocation > a2->descriptionLocation)
00360         {
00361                 return -1;
00362         }
00363         else if (a1->descriptionLocation < a2->descriptionLocation)
00364         {
00365                 return 1;
00366         }
00367         else
00368         {
00369                 return 0;
00370         }
00371 }

Here is the caller graph for this function:

int4 rsdb_compareListLengths ( const void *  list1,
const void *  list2 
)

Definition at line 396 of file rsdb.c.

References finalList::numEntries.

00397 {
00398         const struct finalList *a1, *a2;
00399 
00400         a1 = (struct finalList*)list1;
00401         a2 = (struct finalList*)list2;
00402 
00403         if (a1->numEntries > a2->numEntries)
00404         {
00405                 return -1;
00406         }
00407         else if (a1->numEntries < a2->numEntries)
00408         {
00409                 return 1;
00410         }
00411         else
00412         {
00413                 return 0;
00414         }
00415 }

int4 rsdb_compareSequenceLengths ( const void *  sequence1,
const void *  sequence2 
)

Definition at line 374 of file rsdb.c.

References sequence::length.

Referenced by main().

00375 {
00376         const struct sequence *a1, *a2;
00377 
00378         a1 = (struct sequence*)sequence1;
00379         a2 = (struct sequence*)sequence2;
00380 
00381         if (a1->length > a2->length)
00382         {
00383                 return -1;
00384         }
00385         else if (a1->length < a2->length)
00386         {
00387                 return 1;
00388         }
00389         else
00390         {
00391                 return 0;
00392         }
00393 }

Here is the caller graph for this function:

void rsdb_freePSSM (  ) 

struct wordLocation* rsdb_mergeLists ( struct wordLocation wordLocations1,
uint4 *  numWordLocations1,
struct wordLocation wordLocations2,
uint4 *  numWordLocations2,
int4  mergeRelativeOffset,
uint *  lengthNewList 
) [read]

void rsdb_newParent ( struct parent newParent,
struct sequence child 
)

Definition at line 1205 of file rsdb.c.

References parent::children, global_malloc(), sequence::length, parent::length, parent::number, parent::numChildren, sequence::parent, sequence::regionStart, rsdb_numParents, sequence::sequence, and parent::sequence.

Referenced by rsdb_spexClusterSequences().

01206 {
01207         // Make copy of the child
01208     parent->number = rsdb_numParents; rsdb_numParents++;
01209         parent->length = child->length;
01210         parent->sequence = child->sequence;
01211 
01212     // Link parent to child, and vice versa
01213     parent->numChildren = 1;
01214     parent->children = (struct sequence**)global_malloc(sizeof(struct sequence*));
01215     parent->children[0] = child;
01216     child->regionStart = 0;
01217     child->parent = parent;
01218 }

Here is the call graph for this function:

Here is the caller graph for this function:

void rsdb_printCluster ( struct parent parent  ) 

Definition at line 1169 of file rsdb.c.

References parent::children, int4, sequence::length, parent::length, sequence::number, parent::number, parent::numChildren, sequence::parent, print_singleSequence(), sequence::regionStart, sequence::sequence, and parent::sequence.

01170 {
01171         int4 count, childNum, numCharsSaved;
01172     struct sequence* child;
01173 
01174     printf("### Parent %d with %d children ###\nParent    ", parent->number, parent->numChildren);
01175         print_singleSequence(parent->sequence, parent->length); printf("\n");
01176 
01177     // For each child
01178     numCharsSaved = 0;
01179     childNum = 0;
01180     while (childNum < parent->numChildren)
01181     {
01182         child = parent->children[childNum];
01183 
01184         printf("[%7d] ", child->number);
01185         // Print spaces to align child to parent
01186         count = 0;
01187         while (count < child->regionStart)
01188         {
01189             printf(" ");
01190             count++;
01191         }
01192 
01193         // Print child
01194         print_singleSequence(child->sequence, child->length); printf("\n");
01195 //              printf("Child Length=%d start=%d\n", child->length, child->regionStart);
01196 
01197         numCharsSaved += child->length;
01198         childNum++;
01199     }
01200     printf("%d bytes saved\n", numCharsSaved - parent->length);
01201     printf("\n");
01202 }

Here is the call graph for this function:

void rsdb_printList ( struct wordLocation list,
uint4  length 
)

Definition at line 1107 of file rsdb.c.

References uint4.

01108 {
01109         uint4 count = 0;
01110 
01111     while (count < length)
01112     {
01113         printf("[%d,%d] ", list[count].sequenceNumber, list[count].offset);
01114         count++;
01115     }
01116     printf(".\n");
01117 }

void rsdb_printSequence ( uint4  whitespace,
unsigned char *  sequence,
uint4  length 
)

Definition at line 1094 of file rsdb.c.

References print_singleSequence().

01095 {
01096     // Print whitespace before sequence begins
01097     while (whitespace > 0)
01098     {
01099         printf(" ");
01100         whitespace--;
01101     }
01102 
01103     print_singleSequence(sequence, length); printf("\n");
01104 }

Here is the call graph for this function:

void rsdb_processList ( struct memSingleBlock diagonals,
struct wordLocation wordLocations,
uint4  numWordLocations 
)

Definition at line 1013 of file rsdb.c.

References rsdb_updateDiagonalMatches(), wordLocation::sequenceNumber, and uint4.

Referenced by rsdb_spexClusterSequences().

01015 {
01016         uint4 count1, count2;
01017     struct wordLocation location1, location2;
01018 
01019     // For each pair of locations
01020     count1 = 0;
01021     while (count1 < numWordLocations)
01022     {
01023         count2 = count1 + 1;
01024         while (count2 < numWordLocations)
01025         {
01026             // location1 < location2
01027             if (wordLocations[count1].sequenceNumber < wordLocations[count2].sequenceNumber)
01028             {
01029                 location1 = wordLocations[count1];
01030                 location2 = wordLocations[count2];
01031             }
01032             else
01033             {
01034                 location2 = wordLocations[count1];
01035                 location1 = wordLocations[count2];
01036             }
01037 
01038             // Update accumulators by adding match between two locations
01039             rsdb_updateDiagonalMatches(diagonals, location1, location2);
01040 
01041             count2++;
01042         }
01043         count1++;
01044     }
01045 
01046 //    printf("listLength=%d numNewDiagonalMatches=%d\n", numWordLocations,
01047 //           numNewDiagonalMatches);
01048 }

Here is the call graph for this function:

Here is the caller graph for this function:

void rsdb_processSequenceMatches ( struct memSingleBlock sequenceMatches,
struct sequence sequences 
)

uint4 rsdb_score ( struct sequence seq1,
struct sequence seq2,
int4  relativeOffset,
float  wildscoreAllowed 
)

Definition at line 1242 of file rsdb.c.

References identityAlign_score(), int4, sequence::length, rsdb_threshold, sequence::sequence, and uint4.

Referenced by rsdb_spexClusterSequences().

01244 {
01245     uint4 score, targetScore, identity;
01246     unsigned char* sequence1, *sequence2;
01247     int4 length1, length2, shorterLength;
01248 
01249     sequence1 = seq1->sequence;
01250     length1 = seq1->length;
01251     sequence2 = seq2->sequence;
01252     length2 = seq2->length;
01253 
01254     if (length1 < length2)
01255         shorterLength = length1;
01256         else
01257         shorterLength = length2;
01258 
01259     targetScore = (int)((float)rsdb_threshold * (float)shorterLength / 100.0);
01260     score = identityAlign_score(sequence2, length2, sequence1, length1, -relativeOffset,
01261                                 targetScore);
01262 
01263         identity = 100.0 * (float)score / (float)shorterLength;
01264 
01265     if (identity < rsdb_threshold)
01266         return 0;
01267 
01268 //      printf("score/target=%d,%d\n", score, targetScore);
01269 
01270 //    printf("lengths=%d,%d\n", length1, length2);
01271 //    printf("%d:%d\n", (end1 - start1), score);
01272 
01273     // Return score
01274     return identity;
01275 }

Here is the call graph for this function:

Here is the caller graph for this function:

Hashcounter* rsdb_slottedSpex ( struct sequence sequences,
uint4  numberOfSequences,
uint4  numberOfLetters 
)

Definition at line 418 of file rsdb.c.

References decoQantum, decoWordLength, hashcounter_count(), hashcounter_free(), hashcounter_hash, hashcounter_insert(), hashcounter_multiple(), hashcounter_new(), hashcounter_single(), sequence::length, numIterations, sequence::sequence, and uint4.

Referenced by rsdb_spexClusterSequences().

00420 {
00421         Hashcounter *hashCounter1 = NULL, *hashCounter2 = NULL;
00422         uint4 hashCounterSize = 1;
00423     uint4 words, duplicates, iteration = 0, seed, wordLength;
00424         uint4 sinceChange, submatches, lookup;
00425     uint4 sequenceNumber, sequenceSize, offset, suboffset;
00426         unsigned char* sequence, *word;
00427         uint4 time;
00428     uint4 hashValue1, hashValue2, position;
00429 
00430     // Calculate size of hashcounter
00431     while (pow(2,hashCounterSize) < numberOfLetters / decoQantum)
00432     {
00433         hashCounterSize++;
00434     }
00435     hashCounter2 = hashcounter_new(hashCounterSize, 0);
00436 
00437     wordLength = decoWordLength - ((numIterations - 1) * decoQantum);
00438 
00439     while (iteration < numIterations)
00440     {
00441         time = -clock();
00442         words = 0; duplicates = 0;
00443         sinceChange = 0;
00444 
00445         // For each sequence
00446         sequenceNumber = 0;
00447         while (sequenceNumber < numberOfSequences)
00448         {
00449             sequence = sequences[sequenceNumber].sequence;
00450             sequenceSize = sequences[sequenceNumber].length;
00451 
00452             // For each sequence longer enough that hasn't been excluded
00453             if (sequenceSize >= wordLength)
00454             {
00455                 // For each word
00456                 offset = 0;
00457                 while (offset < sequenceSize - wordLength)
00458                 {
00459                     // If word has occured before
00460                     word = sequence + offset;
00461                     hashcounter_hash(hashCounter2, word, wordLength, hashValue2, position);
00462                     lookup = hashcounter_single(hashCounter2, hashValue2);
00463 
00464                     if (lookup)
00465                         sinceChange = 0;
00466 
00467                     if (lookup == 1)
00468                     {
00469                         // If it has only occured once, now it has occured twice
00470                         hashcounter_insert(hashCounter2, hashValue2);
00471                     }
00472                     else
00473                     {
00474                                                 submatches = 0;
00475                         // First iteration
00476                         if (hashCounter1 != NULL)
00477                         {
00478                                 suboffset = offset;
00479                                 while (suboffset <= offset + decoQantum)
00480                             {
00481                                 word = sequence + suboffset;
00482                                 hashcounter_hash(hashCounter1, word, wordLength - decoQantum, hashValue1, position);
00483                                 if (hashcounter_multiple(hashCounter1, hashValue1))
00484                                         submatches++;
00485                                                                 suboffset++;
00486                             }
00487                         }
00488                         if (sinceChange >= decoQantum && (hashCounter1 == NULL || submatches >= 2))
00489                         {
00490                                                         hashcounter_insert(hashCounter2, hashValue2);
00491                             sinceChange = 0;
00492                         }
00493                                         }
00494 
00495                     sinceChange++;
00496                     offset++;
00497                 }
00498                         }
00499 
00500             sequenceNumber++;
00501         }
00502 
00503         if (hashCounter1)
00504                 hashcounter_free(hashCounter1);
00505                 hashCounter1 = hashCounter2;
00506         seed = rand();
00507 
00508         if (iteration != numIterations - 1)
00509                 hashCounter2 = hashcounter_new(hashCounterSize, 0);
00510 
00511         printf("Iteration %d WordLength=%d\n", iteration, wordLength); fflush(stdout);
00512         hashcounter_count(hashCounter1, stdout); fflush(stdout);
00513 //        printf("Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout);
00514 
00515         time += clock();
00516         printf("Iteration time=%.2f secs\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00517 
00518         iteration++;
00519         wordLength += decoQantum;
00520         }
00521 
00522     return hashCounter1;
00523 }

Here is the call graph for this function:

Here is the caller graph for this function:

void rsdb_spexClusterSequences ( struct sequence sequences,
uint4  numberOfSequences,
uint4  numberOfLetters 
)

Definition at line 682 of file rsdb.c.

References parent::children, decoWordLength, global_int4toString(), global_malloc(), global_totalMalloc, hashcounter_free(), hashcounter_hash, hashcounter_multiple(), int4, sequence::length, sequenceLists::longList, diagonal::matchSequence, maxListLength, memBlocks_free(), memBlocks_getCurrent(), memBlocks_initialize(), memBlocks_newEntry(), memBlocks_resetCurrent(), memSingleBlock_free(), memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_initializeExisting(), memSingleBlock_resetCurrent(), finalList::numEntries, memSingleBlock::numEntries, numHashtablePasses, sequence::parent, postings_addEntry(), postings_decodeList(), postings_getSortedLists(), postings_initialize(), postings_numLists, postings_print(), sequence::regionStart, diagonal::relativeOffset, rsdb_accumulatorsSize, rsdb_addChild(), rsdb_newParent(), rsdb_parents, rsdb_processList(), rsdb_score(), rsdb_slottedSpex(), rsdb_updateDiagonalMatches(), sequence::sequence, wordLocation::sequenceNumber, uint4, and wildcardsThreshold.

Referenced by main().

00684 {
00685         Hashcounter *hashCounter;
00686     uint4 seed, wordLength, position, hashValue;
00687     uint4 sequenceNumber, sequenceSize, offset;
00688         unsigned char* sequence, *word;
00689     uint4 hashtablePass = 0, passhash;
00690     int4 key, numWordLocations;
00691     struct wordLocation* wordLocations, *longListWordLocations;
00692     int4 relativeOffset;
00693     uint4 sequence1, sequence2, score, location1, location2;
00694         struct memSingleBlock* diagonals, *sequenceDiagonals;
00695     struct diagonal* diagonalMatch;
00696     struct sequenceMatch *sequenceMatch;
00697     struct memSingleBlock* sequenceMatches;
00698         struct sequence* seq1, *seq2;
00699     uint4 time, numPairs;
00700     struct parent* parent;
00701     struct finalList* finalLists;
00702     struct longList *longList;
00703     struct sequenceLists **sequencesLists, *sequenceLists;
00704     struct memBlocks *blockLongLists, *blockSequenceLists;
00705     uint4 newListLength;
00706 
00707     // Perform SPEX algorithm
00708 //    hashCounter = rsdb_winnowingSpex(sequences, numberOfSequences, numberOfLetters);
00709     hashCounter = rsdb_slottedSpex(sequences, numberOfSequences, numberOfLetters);
00710 
00711     // Memory to store long lists
00712     blockLongLists = memBlocks_initialize(sizeof(struct longList), 10000);
00713     blockSequenceLists = memBlocks_initialize(sizeof(struct sequenceLists), 10000);
00714 
00715     // Temporary buffer for storing word locations
00716     wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences);
00717 
00718     // Initialize struct to hold long lists
00719     sequencesLists = (struct sequenceLists**)global_malloc(sizeof(struct sequenceLists*) * numberOfSequences);
00720 
00721     // Initialize diagonals
00722     diagonals = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock)
00723                                                       * numberOfSequences);
00724         sequenceNumber = 0;
00725     while (sequenceNumber < numberOfSequences)
00726     {
00727         memSingleBlock_initializeExisting(diagonals + sequenceNumber, sizeof(struct diagonal), 1);
00728         sequencesLists[sequenceNumber] = NULL;
00729         sequenceNumber++;
00730     }
00731 
00732     printf("Initialized diagonals\n"); fflush(stdout);
00733 
00734     wordLength = decoWordLength;
00735     seed = rand();
00736 
00737     printf("Constructing and processing postings lists"); fflush(stdout);
00738     // Process collection a section at a time
00739     while (hashtablePass < numHashtablePasses)
00740     {
00741         time = -clock();
00742         postings_initialize();
00743 
00744         // For each sequence
00745         sequenceNumber = 0;
00746         while (sequenceNumber < numberOfSequences)
00747         {
00748             sequence = sequences[sequenceNumber].sequence;
00749             sequenceSize = sequences[sequenceNumber].length;
00750 
00751             // For each sequence long enough that hasn't been excluded
00752             if (sequenceSize >= wordLength)
00753             {
00754                 // Create initial passhash
00755                 passhash = 0;
00756                                 offset = 0;
00757                 while (offset < wordLength)
00758                 {
00759                                         passhash += sequence[offset];
00760                         offset++;
00761                 }
00762 
00763                 // For each word
00764                 offset = 0;
00765                 while (offset < sequenceSize - wordLength)
00766                 {
00767                         // If to be processed this pass
00768                     if (passhash % numHashtablePasses == hashtablePass)
00769                     {
00770                         // Insert into hash table
00771                         word = sequence + offset;
00772                         hashcounter_hash(hashCounter, word, wordLength, hashValue, position);
00773                         if (hashcounter_multiple(hashCounter, hashValue))
00774                         {
00775                                 postings_addEntry(sequence + offset, wordLength, sequenceNumber, offset);
00776                         }
00777                                         }
00778 
00779                     // Update passhash
00780                     passhash -= sequence[offset];
00781                     passhash += sequence[offset + wordLength];
00782                     offset++;
00783                 }
00784             }
00785 
00786             sequenceNumber++;
00787         }
00788 
00789         // Report the load on the hashtable
00790         printf("Hashtable pass %d\n", hashtablePass);
00791         postings_print();
00792         printf("Total malloced=%s\n", global_int4toString(global_totalMalloc));
00793                 time += clock();
00794                 printf("Hashtable construction time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00795                 time = -clock();
00796 
00797         // Sort hash table entries in order of size
00798                 finalLists = postings_getSortedLists();
00799 
00800         // Start with the longest list
00801         key = postings_numLists - 1;
00802         while (key >= 0)
00803         {
00804             // If list is too long to process
00805             if (finalLists[key].numEntries > maxListLength)
00806             {
00807                 // Decode the long list
00808                 longListWordLocations = (struct wordLocation*)global_malloc(
00809                                         sizeof(struct wordLocation) * finalLists[key].numEntries);
00810                 numWordLocations = postings_decodeList(finalLists[key], longListWordLocations);
00811 
00812                 longList = (struct longList*)memBlocks_newEntry(blockLongLists);
00813                 longList->list = longListWordLocations;
00814                 longList->length = numWordLocations;
00815 
00816                 rsdb_accumulatorsSize += sizeof(struct longList);
00817                                 rsdb_accumulatorsSize += sizeof(struct wordLocation) * finalLists[key].numEntries;
00818 
00819                 // For each sequence in the list
00820                 while (numWordLocations > 0)
00821                 {
00822                         // Add a reference to this long list
00823                         numWordLocations--;
00824                     sequenceLists = (struct sequenceLists*)memBlocks_newEntry(blockSequenceLists);
00825                     sequenceLists->next = sequencesLists[longListWordLocations[numWordLocations].sequenceNumber];
00826                     sequenceLists->longList = longList;
00827                                         sequencesLists[longListWordLocations[numWordLocations].sequenceNumber] = sequenceLists;
00828 
00829                     rsdb_accumulatorsSize += sizeof(struct sequenceLists);
00830                 }
00831             }
00832             else
00833             {
00834                 // Get each list
00835                 numWordLocations = postings_decodeList(finalLists[key], wordLocations);
00836 
00837                 // Process a list of word locations and insert/update diagonal entries
00838                 rsdb_processList(diagonals, wordLocations, numWordLocations);
00839                         }
00840 
00841             key--;
00842         }
00843 
00844                 time += clock();
00845                 printf("Hashtable processing time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00846                 printf("."); fflush(stdout);
00847 
00848         printf("Accumulators size=%.1f Mb\n", (float)rsdb_accumulatorsSize / 1024.0 / 1024.0);
00849 
00850         free(finalLists);
00851 
00852         hashtablePass++;
00853     }
00854 
00855         printf("done.\n"); fflush(stdout);
00856 
00857     // Free hash counter
00858     hashcounter_free(hashCounter);
00859 
00860     free(wordLocations);
00861 
00862     printf("Identifying high-scoring sequence pairs / hierarchical clustering..."); fflush(stdout);
00863     time = -clock();
00864 
00865     sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences);
00866 
00867     // For each sequence
00868     sequenceNumber = 0;
00869     numPairs = 0;
00870     while (sequenceNumber < numberOfSequences)
00871     {
00872         sequence1 = sequenceNumber;
00873         seq1 = sequences + sequence1;
00874 
00875         // Don't process sequences that already have parents
00876         if (seq1->parent == NULL)
00877         {
00878             // For each long list this sequence appears in
00879             sequenceLists = sequencesLists[sequenceNumber];
00880             while (sequenceLists != NULL)
00881             {
00882                 longList = sequenceLists->longList;
00883 
00884                 // Get the long list
00885                 wordLocations = longList->list;
00886                 numWordLocations = longList->length;
00887 
00888 //                              printf("Seq %d listLength %d\n", sequenceNumber, numWordLocations);
00889 
00890                 // Find entry for sequence 1
00891                 location1 = 0;
00892                 while (location1 < numWordLocations)
00893                 {
00894                         if (wordLocations[location1].sequenceNumber == sequence1)
00895                         break;
00896 
00897                     location1++;
00898                 }
00899 
00900                 // For every entry after that
00901                 location2 = location1 + 1;
00902                 newListLength = location2;
00903                 while (location2 < numWordLocations)
00904                 {
00905                     if (sequences[wordLocations[location2].sequenceNumber].parent == NULL)
00906                     {
00907                         // Update accumulators
00908                         rsdb_updateDiagonalMatches(diagonals, wordLocations[location1],
00909                                                       wordLocations[location2]);
00910 
00911                         // Remove clustered sequences from the list
00912                         wordLocations[newListLength] = wordLocations[location2];
00913 
00914                                                 newListLength++;
00915                     }
00916 
00917                     location2++;
00918                 }
00919 
00920 //                printf("Length Now %d\n", newListLength);
00921                 longList->length = newListLength;
00922 
00923                 sequenceLists = sequenceLists->next;
00924             }
00925 
00926             // Go through list of matching diagonals
00927             sequenceDiagonals = diagonals + sequenceNumber;
00928             memSingleBlock_resetCurrent(sequenceDiagonals);
00929             while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL)
00930             {
00931                 sequence2 = diagonalMatch->matchSequence;
00932                 relativeOffset = diagonalMatch->relativeOffset;
00933                 seq2 = sequences + sequence2;
00934 
00935 /*
00936                 if (minimum(seq1->length, seq2->length) > 10)
00937                 {
00938                     // Calculate rough score based on number of matches, length of overlap region
00939                     score = 100 * diagonalMatch->numMatches * decoWordLength /
00940                             minimum(seq1->length, seq2->length);
00941 
00942 //                    score = rsdb_score(seq1, seq2, relativeOffset, wildcardsThreshold);
00943 //                    if (score > 0)
00944                         printf("%d,%d matches=%d w=%d q=%d i=%d seqLength=%d\n",
00945                                seq1->number, seq2->number, diagonalMatch->numMatches, decoWordLength,
00946                                decoQantum, numIterations, minimum(seq1->length, seq2->length), score);
00947                                 }
00948 */
00949 
00950 //                if (score >= percentMatchesRequired)
00951                 {
00952                     numPairs++;
00953 
00954                     // Case 1 - neither sequence has a parent
00955                     if (seq1->parent == NULL && seq2->parent == NULL)
00956                     {
00957                         // Compare the two sequences in detail
00958                         score = rsdb_score(seq1, seq2, relativeOffset, wildcardsThreshold);
00959 
00960                         if (score > 0)
00961                         {
00962                 //              printf("New cluster\n"); fflush(stdout);
00963                             parent = (struct parent*)memBlocks_newEntry(rsdb_parents);
00964                             rsdb_newParent(parent, seq1);
00965                             rsdb_addChild(parent, seq2, relativeOffset);
00966                         }
00967                     }
00968                     // Case 2 - sequence1 has a parent, sequence2 does not
00969                     else if (seq1->parent != NULL && seq2->parent == NULL)
00970                     {
00971             //          printf("Add to %d\n", seq1->parent->numChildren); fflush(stdout);
00972 
00973                         // Check score for aligning sequence to parent
00974                         parent = seq1->parent;
00975                         relativeOffset += seq1->regionStart;
00976 
00977                         score = rsdb_score(parent->children[0], seq2, relativeOffset, wildcardsThreshold);
00978 
00979                         if (score > 0)
00980                         {
00981                             rsdb_addChild(seq1->parent, seq2, relativeOffset);
00982                         }
00983                     }
00984                 }
00985             }
00986                 }
00987 
00988         free(diagonals[sequenceNumber].block);
00989 
00990         sequenceNumber++;
00991     }
00992 
00993     printf("done. (%d pairs)\n", numPairs); fflush(stdout);
00994     time += clock();
00995     printf("Total time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00996 
00997     // Free long lists
00998     memBlocks_resetCurrent(blockLongLists);
00999     while ((longList = memBlocks_getCurrent(blockLongLists)) != NULL)
01000     {
01001         free(longList->list);
01002     }
01003         memBlocks_free(blockSequenceLists);
01004         memBlocks_free(blockLongLists);
01005 
01006         free(sequencesLists);
01007     free(diagonals);
01008 
01009     memSingleBlock_free(sequenceMatches);
01010 }

Here is the call graph for this function:

Here is the caller graph for this function:

void rsdb_updateDiagonalMatches ( struct memSingleBlock diagonals,
struct wordLocation  location1,
struct wordLocation  location2 
)

Definition at line 1051 of file rsdb.c.

References int4, diagonal::matchSequence, memSingleBlock_getCurrent(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), wordLocation::offset, diagonal::relativeOffset, rsdb_accumulatorsSize, and wordLocation::sequenceNumber.

Referenced by rsdb_processList(), and rsdb_spexClusterSequences().

01053 {
01054     struct memSingleBlock *sequenceDiagonals;
01055     struct diagonal* diagonalMatch;
01056         int4 relativeOffset;
01057     char matchFound;
01058 
01059     // If the locations don't both refer to the same sequence
01060     if (location1.sequenceNumber != location2.sequenceNumber)
01061     {
01062         // Search list of matching diagonals for this sequence for one that is the
01063         // same diagonal and matching sequence as the new match
01064         relativeOffset = location1.offset - location2.offset;
01065         matchFound = 0;
01066         sequenceDiagonals = diagonals + location1.sequenceNumber;
01067         memSingleBlock_resetCurrent(sequenceDiagonals);
01068         while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL)
01069         {
01070             if (diagonalMatch->relativeOffset == relativeOffset &&
01071                 diagonalMatch->matchSequence == location2.sequenceNumber)
01072             {
01073                 // If we found an identinal entry, increment number of matches
01074                 matchFound = 1;
01075 //                diagonalMatch->numMatches++;
01076                 break;
01077             }
01078         }
01079 
01080         // Otherwise create a new entry, ONLY if neither sequence1 or 2 belong to a cluster
01081         if (!matchFound)// && sequences[wordLocations[count1].sequenceNumber].parent == NULL
01082                         // && sequences[wordLocations[count2].sequenceNumber].parent == NULL)
01083         {
01084             diagonalMatch = memSingleBlock_newEntry(sequenceDiagonals);
01085             diagonalMatch->relativeOffset = relativeOffset;
01086             diagonalMatch->matchSequence = location2.sequenceNumber;
01087 //            diagonalMatch->numMatches = 1;
01088 
01089             rsdb_accumulatorsSize += sizeof(struct diagonal);
01090         }
01091     }
01092 }

Here is the call graph for this function:

Here is the caller graph for this function:

Hashcounter* rsdb_winnowingSpex ( struct sequence sequences,
uint4  numberOfSequences,
uint4  numberOfLetters 
)

Definition at line 526 of file rsdb.c.

References decoQantum, decoWordLength, hashcounter_count(), hashcounter_free(), hashcounter_hash, hashcounter_insert(), hashcounter_multiple(), hashcounter_new(), sequence::length, numIterations, sequence::sequence, and uint4.

00528 {
00529         Hashcounter *hashCounter1 = NULL, *hashCounter2 = NULL;
00530         uint4 hashCounterSize = 1;
00531     uint4 words, duplicates, iteration = 0, seed, wordLength;
00532         uint4 sinceChange, submatches, lookup;
00533     uint4 sequenceNumber, sequenceSize, offset, suboffset;
00534         unsigned char* sequence, *word;
00535         uint4 time;
00536     uint4 hashValue1, hashValue2, position, smallestPosition;
00537         uint4 windowHashes[decoQantum], inserted[decoQantum];
00538 
00539 //    int debug = 0;
00540 
00541     // Calculate size of hashcounter
00542     while (pow(2,hashCounterSize) < numberOfLetters / decoQantum)
00543     {
00544         hashCounterSize++;
00545     }
00546     hashCounter2 = hashcounter_new(hashCounterSize, 0);
00547 
00548     wordLength = decoWordLength - ((numIterations - 1) * decoQantum);
00549 
00550     while (iteration < numIterations)
00551     {
00552         time = -clock();
00553         words = 0; duplicates = 0;
00554         sinceChange = 0;
00555 
00556 //        if (debug)
00557 //        printf("Iteration %d wordLength %d\n", iteration, wordLength);
00558 
00559         // For each sequence
00560         sequenceNumber = 0;
00561         while (sequenceNumber < numberOfSequences)
00562         {
00563             sequence = sequences[sequenceNumber].sequence;
00564             sequenceSize = sequences[sequenceNumber].length;
00565 
00566             // For each sequence longer enough that hasn't been excluded
00567             if (sequenceSize >= wordLength)
00568             {
00569                 // For each word
00570                 offset = 0;
00571 
00572                 // For first few words
00573                 while (offset < decoQantum)
00574                 {
00575                         // Just add hash to the window
00576                     word = sequence + offset;
00577                     hashcounter_hash(hashCounter2, word, wordLength, hashValue2, position);
00578                                         windowHashes[offset] = hashValue2;
00579                     inserted[offset] = 0;
00580 
00581 //                    if (debug)
00582 //                    printf("New hash [%d] %u\n", offset, hashValue2);
00583 
00584                     offset++;
00585                 }
00586 
00587                 // For remaining words
00588                 while (offset < sequenceSize - wordLength)
00589                 {
00590                         // If not the first iteration
00591                     submatches = 0;
00592                         if (hashCounter1 != NULL)
00593                                 {
00594                         // Count number of sub-chunks that appear more than once
00595                         suboffset = offset;
00596                         while (suboffset <= offset + decoQantum)
00597                         {
00598                             word = sequence + suboffset;
00599                             hashcounter_hash(hashCounter1, word, wordLength - decoQantum, hashValue1, position);
00600                             if (hashcounter_multiple(hashCounter1, hashValue1))
00601                                 submatches++;
00602                             suboffset++;
00603                         }
00604                     }
00605 
00606                     // If this word may appear more than once in the collection
00607                     if (hashCounter1 == NULL || submatches >= 2)
00608                     {
00609                                                 // Hash the word and add to window
00610                         word = sequence + offset;
00611                         hashcounter_hash(hashCounter2, word, wordLength, hashValue2, position);
00612                         windowHashes[offset % decoQantum] = hashValue2;
00613                         inserted[offset % decoQantum] = 0;
00614 
00615 //                          if (debug)
00616 //                        printf("New hash [%d] %u (submatches=%d) winPos=%d\n", offset, hashValue2, submatches, offset % decoQantum);
00617 
00618                         // Search window for smallest hash value
00619                         position = 0; smallestPosition = 0;
00620                         while (position < decoQantum)
00621                         {
00622 //                                  if (debug)
00623 //                                                      printf("Old hash %u [%d/%d]\n", windowHashes[position], position, decoQantum);
00624 
00625                             if (windowHashes[position] < windowHashes[smallestPosition])
00626                                 smallestPosition = position;
00627 
00628                                 position++;
00629                         }
00630 
00631                         // If no smaller value, insert word into hash counter
00632                         if (!inserted[smallestPosition])
00633                         {
00634                                 inserted[smallestPosition] = 1;
00635 //                                  if (debug)
00636 //                            {
00637 //                                printf("Inserted %d! ", windowHashes[smallestPosition]);
00638 //                                print_singleSequence(sequence + offset, wordLength);
00639 //                                printf("\n");
00640 //                                                      }
00641                             hashcounter_insert(hashCounter2, hashValue2);
00642                                                 }
00643                     }
00644                     else
00645                     {
00646 //                          if (debug)
00647 //                      printf("Null hash\n");
00648                         // Otherwise insert placeholder into window
00649                         windowHashes[offset % decoQantum] = hashCounterSize;
00650                     }
00651 
00652                     offset++;
00653                 }
00654                         }
00655 
00656             sequenceNumber++;
00657         }
00658 
00659         if (hashCounter1)
00660                 hashcounter_free(hashCounter1);
00661                 hashCounter1 = hashCounter2;
00662         seed = rand();
00663 
00664         if (iteration != numIterations - 1)
00665                 hashCounter2 = hashcounter_new(hashCounterSize, 0);
00666 
00667         printf("Iteration %d WordLength=%d\n", iteration, wordLength); fflush(stdout);
00668         hashcounter_count(hashCounter1, stdout); fflush(stdout);
00669 //        printf("Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout);
00670 
00671         time += clock();
00672         printf("Iteration time=%.2f secs\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00673 
00674         iteration++;
00675         wordLength += decoQantum;
00676         }
00677 
00678     return hashCounter1;
00679 }

Here is the call graph for this function:

void rsdb_writeClusters ( char *  filename,
struct sequence sequences 
)

Definition at line 261 of file rsdb.c.

References parent::children, sequence::descriptionLength, sequence::descriptionLocation, descriptions_getDescription(), global_malloc(), sequence::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), parent::numChildren, sequence::parent, readdb_dbAlphabetType, readdb_numberOfSequences, rsdb_compareDescriptionLocations(), rsdb_parents, sequence::sequence, parent::sequence, uint4, writedb_addSequence(), writedb_close(), writedb_initialize(), and writedb_numberOfLetters.

Referenced by main().

00262 {
00263     struct parent* parent;
00264         char *newFilename, *wildcardsOutFilename, *renamedFilename;
00265     uint4 count, sequenceNumber, sequenceLength, descriptionLength, descriptionLocation;
00266         unsigned char *sequence, *description;
00267     uint4 childNum, numChildren, descriptionTotalLength, numWilds;
00268         struct child* children;
00269         struct wild* wilds, *wildcards;
00270 
00271         // Construct sequence and description filenames
00272         newFilename = (char*)global_malloc(strlen(filename) + 12);
00273         sprintf(newFilename, "%s_rsdb", filename);
00274 
00275     // Initialize writing to formatted database
00276     writedb_initialize(newFilename, readdb_dbAlphabetType);
00277 
00278     printf("Writing representative sequences to disk..."); fflush(stdout);
00279 
00280     // Sort sequences back to original order
00281     qsort(sequences, readdb_numberOfSequences, sizeof(struct sequence),
00282           rsdb_compareDescriptionLocations);
00283 
00284     // For each sequence without a parent or only child
00285     sequenceNumber = 0;
00286         while (sequenceNumber < readdb_numberOfSequences)
00287         {
00288         if (sequences[sequenceNumber].parent == NULL ||
00289             sequences[sequenceNumber].parent->numChildren == 1 ||
00290             sequences[sequenceNumber].parent->sequence == sequences[sequenceNumber].sequence)
00291         {
00292             sequenceLength = sequences[sequenceNumber].length;
00293             if (sequenceLength > 10)
00294             {
00295                 sequence = sequences[sequenceNumber].sequence;
00296                 descriptionLength = sequences[sequenceNumber].descriptionLength;
00297                 descriptionLocation = sequences[sequenceNumber].descriptionLocation;
00298 
00299                 // Load description from old collection
00300     //            description = sequences[sequenceNumber].description;
00301                 description = descriptions_getDescription(descriptionLocation, descriptionLength);
00302 
00303                 // Add sequence to the formatted collection
00304                 writedb_addSequence(sequence, sequenceLength, description,
00305                                     descriptionLength, NULL, 0, NULL, 0);
00306 
00307                 free(description);
00308                         }
00309         }
00310 
00311         // Free the sequence
00312 //        free(sequences[sequenceNumber].sequence);
00313 
00314         sequenceNumber++;
00315         }
00316 
00317     // For each parent
00318     memBlocks_resetCurrent(rsdb_parents);
00319         while ((parent = memBlocks_getCurrent(rsdb_parents)) != NULL)
00320     {
00321                 // Free the parent
00322         free(parent->children);
00323     }
00324 
00325     printf("done.\n"); fflush(stdout);
00326 
00327     printf("%llu letters written\n", writedb_numberOfLetters);
00328 
00329     // Finalize writing to the new formatted collection
00330     writedb_close();
00331 
00332 /*    // Rename clustered collection
00333     renamedFilename = (char*)global_malloc(strlen(filename) + 20);
00334         sprintf(renamedFilename, "%s.wildcards", filename);
00335     if (rename(wildcardsOutFilename, renamedFilename))
00336         fprintf(stderr, "Error renaming file %s to %s\n", wildcardsOutFilename, renamedFilename);
00337 
00338         sprintf(renamedFilename, "%s.sequences", filename);
00339     if (rename(writedb_sequenceFilename, renamedFilename))
00340         fprintf(stderr, "Error renaming file %s to %s\n", writedb_sequenceFilename, renamedFilename);
00341 
00342         sprintf(renamedFilename, "%s.descriptions", filename);
00343     if (rename(writedb_descriptionsFilename, renamedFilename))
00344         fprintf(stderr, "Error renaming file %s to %s\n", writedb_descriptionsFilename, renamedFilename);
00345 
00346         sprintf(renamedFilename, "%s.data", filename);
00347     if (rename(writedb_dataFilename, renamedFilename))
00348         fprintf(stderr, "Error renaming file %s to %s\n", writedb_dataFilename, renamedFilename);*/
00349 }

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

uint4 decoQantum = 9

Definition at line 20 of file rsdb.c.

uint4 decoWordLength = 30

Definition at line 18 of file rsdb.c.

uint4 maxListLength = 20

Definition at line 21 of file rsdb.c.

uint4 numIterations = 3

Definition at line 19 of file rsdb.c.

uint4 rsdb_accumulatorsSize = 0

Definition at line 83 of file rsdb.c.

Referenced by rsdb_spexClusterSequences(), and rsdb_updateDiagonalMatches().

float rsdb_averageMatchScore = 0

Definition at line 27 of file rsdb.c.

Referenced by main().

float* rsdb_averageWildcodeScores = NULL

Definition at line 28 of file rsdb.c.

Referenced by main().

uint4 rsdb_numMergeDiagonals

Definition at line 78 of file rsdb.c.

uint4 rsdb_numParents = 0

Definition at line 80 of file rsdb.c.

Referenced by rsdb_newParent().

struct memBlocks* rsdb_parents

Definition at line 79 of file rsdb.c.

Referenced by main(), rsdb_calculateSavings(), rsdb_spexClusterSequences(), and rsdb_writeClusters().

uint4** rsdb_PSSM

Definition at line 81 of file rsdb.c.

int4 rsdb_PSSMend

Definition at line 82 of file rsdb.c.

int4 rsdb_PSSMlength

Definition at line 82 of file rsdb.c.

int4 rsdb_PSSMstart

Definition at line 82 of file rsdb.c.

float wildcardsThreshold = 0.25

Definition at line 77 of file rsdb.c.


Generated on Wed Dec 19 20:59:58 2007 for fsa-blast by  doxygen 1.5.2