#include "blast.h"#include "vbyte2.h"#include "readindex.h"#include "vec.h"#include "vbyte2.c"#include "readindex.c"#include "vec.c"Include dependency graph for cluster.c:

Go to the source code of this file.
Data Structures | |
| struct | diagonal |
| struct | sequenceMatch |
| struct | sequence |
| struct | parent |
| struct | wildCode |
Defines | |
| #define | decoWordLength 30 |
| #define | numIterations 3 |
| #define | decoQantum 9 |
| #define | maxListLength 100 |
| #define | wildcardSumWindow 100 |
| #define | percentMatchesRequired 15 |
Functions | |
| void | cluster_printSequence (uint4 whitespace, unsigned char *sequence, uint4 length) |
| void | cluster_processSequenceMatches (struct memSingleBlock *sequenceMatches, struct sequence *sequences) |
| void | cluster_simpleClusterSequences (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters) |
| void | cluster_clusterSequences (struct sequence *sequences, uint4 numSequences, uint4 numberOfLetters, char *filename) |
| int4 | cluster_compareScore (const void *sequenceMatch1, const void *sequenceMatch2) |
| void | cluster_processList (struct memSingleBlock *diagonals, struct wordLocation *wordLocations, uint4 numWordLocations) |
| void | cluster_printCluster (struct parent *parent) |
| uint4 | cluster_score (struct sequence *seq1, struct sequence *seq2, int4 relativeOffset, float wildscoreAllowed) |
| uint4 | cluster_overlap (struct sequence *sequence1, struct sequence *sequence2, int4 relativeOffset) |
| void | cluster_newParent (struct parent *newParent, struct sequence *child) |
| void | cluster_addChild (struct parent *parent, struct sequence *child, int4 relativeOffset) |
| void | cluster_mergeParents (struct parent *parent1, struct parent *parent2, int4 relativeOffset) |
| uint4 | cluster_checkMerge (struct wordLocation *wordLocations1, uint4 numWordLocations1, struct wordLocation *wordLocations2, uint4 numWordLocations2, int4 *bestDiagonal) |
| wordLocation * | cluster_mergeLists (struct wordLocation *wordLocations1, uint4 *numWordLocations1, struct wordLocation *wordLocations2, uint4 *numWordLocations2, int4 mergeRelativeOffset, uint *lengthNewList) |
| void | cluster_printList (struct wordLocation *list, uint4 length) |
| int | cluster_buildCluster (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences) |
| uint4 | cluster_removeChildren (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences) |
| int4 | cluster_calculateSavings () |
| void | cluster_writeClusters (char *filename, struct sequence *sequences) |
| edit * | cluster_getEdits (struct parent *parent, struct sequence *child, uint4 *numEdits) |
| int | cluster_buildPSSM (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences) |
| float | cluster_averageWildcodeScore (uint4 wildcode) |
| void | cluster_freePSSM () |
| void | cluster_spexClusterSequences (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters) |
| int4 | main (int4 argc, char *argv[]) |
| int4 | cluster_compareSequenceLengths (const void *sequence1, const void *sequence2) |
| void | cluster_updateWildcodes (struct parent *parent, int4 change) |
| uint4 | cluster_getWildcode (struct parent *parent, uint4 position) |
| void | cluster_updateWildcode (struct parent *parent, uint4 position, uint4 wildCode) |
Variables | |
| float | cluster_averageMatchScore = 0 |
| float * | cluster_averageWildcodeScores = NULL |
| float | wildcardsThreshold = 0.25 |
| uint4 | cluster_numMergeDiagonals |
| memBlocks * | cluster_parents |
| uint4 | cluster_numParents = 0 |
| uint4 ** | cluster_PSSM |
| int4 | cluster_PSSMstart |
| int4 | cluster_PSSMend |
| int4 | cluster_PSSMlength |
| #define decoQantum 9 |
Definition at line 19 of file cluster.c.
Referenced by cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), and rsdb_winnowingSpex().
| #define decoWordLength 30 |
Definition at line 17 of file cluster.c.
Referenced by cluster_clusterSequences(), cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), rsdb_spexClusterSequences(), and rsdb_winnowingSpex().
| #define maxListLength 100 |
Definition at line 20 of file cluster.c.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), cluster_spexClusterSequences(), main(), and rsdb_spexClusterSequences().
| #define numIterations 3 |
Definition at line 18 of file cluster.c.
Referenced by cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), and rsdb_winnowingSpex().
| #define percentMatchesRequired 15 |
Definition at line 22 of file cluster.c.
Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().
| #define wildcardSumWindow 100 |
Definition at line 1771 of file cluster.c.
References parent::children, cluster_getWildcode(), cluster_updateWildcode(), cluster_updateWildcodes(), encoding_aaStartWildcards, getbit, global_malloc(), global_realloc(), int4, sequence::length, parent::length, parent::numChildren, sequence::parent, sequence::regionStart, sequence::sequence, parent::sequence, setbit, wildcards_clusterWildcards, wildcards_numClusterWildcards, and wildcards_printWildcard().
Referenced by cluster_buildCluster(), cluster_mergeParents(), and cluster_processSequenceMatches().
01772 { 01773 int4 childNum, count, wildCode, wildcardCount; 01774 unsigned char* newParentSequence; 01775 01776 if (relativeOffset < 0) 01777 { 01778 // Child starts before parent, add new sequence at beginning of parent 01779 newParentSequence = (char*)global_malloc(parent->length - relativeOffset); 01780 memcpy(newParentSequence - relativeOffset, parent->sequence, parent->length); 01781 memcpy(newParentSequence, child->sequence, -relativeOffset); 01782 free(parent->sequence); 01783 parent->sequence = newParentSequence; 01784 parent->length -= relativeOffset; 01785 01786 // Update children coordinates 01787 childNum = 0; 01788 while (childNum < parent->numChildren) 01789 { 01790 parent->children[childNum]->regionStart -= relativeOffset; 01791 childNum++; 01792 } 01793 01794 // Update wildcodes 01795 cluster_updateWildcodes(parent, -relativeOffset); 01796 01797 relativeOffset = 0; 01798 } 01799 01800 // If child is longer than parent 01801 if (child->length + relativeOffset > parent->length) 01802 { 01803 // Add end of new sequence onto parent 01804 parent->sequence = (char*)global_realloc(parent->sequence, child->length + relativeOffset); 01805 memcpy(parent->sequence + parent->length, child->sequence + parent->length - relativeOffset, 01806 child->length - parent->length + relativeOffset); 01807 parent->length = child->length + relativeOffset; 01808 } 01809 01810 // printf("[%d:%d:%d]\n", child->length, parent->length, relativeOffset); 01811 // For region that child and parent have in common 01812 count = 0; 01813 while (count < child->length) 01814 { 01815 // printf("[%d/%d]", count + relativeOffset, parent->length); fflush(stdout); 01816 // If characters don't match 01817 if (child->sequence[count] != parent->sequence[count + relativeOffset]) 01818 { 01819 // printf("%d: %d,%d\n", count + relativeOffset, child->sequence[count], 01820 // parent->sequence[count + relativeOffset]); fflush(stdout); 01821 01822 if (parent->sequence[count + relativeOffset] >= encoding_aaStartWildcards) 01823 { 01824 // Get parent wildcode 01825 wildCode = cluster_getWildcode(parent, count + relativeOffset); 01826 } 01827 else 01828 { 01829 // Add parent character to new wildcode 01830 wildCode = 0; 01831 setbit(wildCode, parent->sequence[count + relativeOffset]); 01832 } 01833 01834 // If child residue will change wildcode 01835 if (!getbit(wildCode, child->sequence[count])) 01836 { 01837 // Add child character to wildcode 01838 setbit(wildCode, child->sequence[count]); 01839 cluster_updateWildcode(parent, count + relativeOffset, wildCode); 01840 01841 // printf("set %d pos %d = %d\n", parent->number, count + relativeOffset, wildCode); 01842 01843 // Check for a match in set of wildcards 01844 wildcardCount = 0; 01845 while (wildcardCount < wildcards_numClusterWildcards) 01846 { 01847 if ((wildcards_clusterWildcards[wildcardCount].wildCode & wildCode) == wildCode) 01848 { 01849 // printf("Match: "); 01850 // wildcards_printWildcard(wildcards_clusterWildcards[wildcardCount].wildCode); 01851 break; 01852 } 01853 wildcardCount++; 01854 } 01855 01856 if (wildcardCount >= wildcards_numClusterWildcards) 01857 { 01858 printf("Error %d >= %d\n", wildcardCount, wildcards_numClusterWildcards); 01859 wildcards_printWildcard(wildCode); 01860 exit(-1); 01861 } 01862 01863 // Insert wildcard character into parent 01864 parent->sequence[count + relativeOffset] = encoding_aaStartWildcards + wildcardCount; 01865 } 01866 01867 // printf("Inserted wildNum %d into parent\n", wildcardCount); 01868 } 01869 count++; 01870 } 01871 01872 // Add new child 01873 child->regionStart = relativeOffset; 01874 child->parent = parent; 01875 parent->numChildren++; 01876 parent->children = (struct sequence**)global_realloc(parent->children, 01877 sizeof(struct sequence*) * parent->numChildren); 01878 parent->children[parent->numChildren - 1] = child; 01879 01880 // printf("Added %d: \n", child->number); 01881 // cluster_printCluster(parent); 01882 }
Here is the call graph for this function:

Here is the caller graph for this function:

| float cluster_averageWildcodeScore | ( | uint4 | wildcode | ) |
Definition at line 2091 of file cluster.c.
References cluster_averageWildcodeScores, constants_sentinalScore, encoding_numLetters, getbit, global_malloc(), int4, scoreMatrix::matrix, Robinson_prob, and wildcards_scoreMatrix.
02092 { 02093 unsigned char code, wildResidue; 02094 float averageScore = 0; 02095 int4 bestScore, tempCode; 02096 02097 // Initialize array to hold precomputed answers 02098 if (cluster_averageWildcodeScores == NULL) 02099 { 02100 cluster_averageWildcodeScores = (float*)global_malloc(sizeof(float) 02101 * ceil(pow(2, encoding_numLetters))); 02102 tempCode = 0; 02103 while (tempCode < ceil(pow(2, encoding_numLetters))) 02104 { 02105 cluster_averageWildcodeScores[tempCode] = 0; 02106 tempCode++; 02107 } 02108 } 02109 02110 if (cluster_averageWildcodeScores[wildCode] == 0) 02111 { 02112 // For each residue 02113 code = 0; 02114 while (code < encoding_numLetters) 02115 { 02116 // For each residue in wildcode 02117 bestScore = constants_sentinalScore; 02118 wildResidue = 0; 02119 while (wildResidue < encoding_numLetters) 02120 { 02121 if (getbit(wildCode, wildResidue)) 02122 { 02123 if (wildcards_scoreMatrix.matrix[code][wildResidue] > bestScore) 02124 bestScore = wildcards_scoreMatrix.matrix[code][wildResidue]; 02125 } 02126 wildResidue++; 02127 } 02128 02129 // printf("averageScore=%f\n", (float)bestScore); 02130 averageScore += (float)bestScore * (Robinson_prob[code] / 1000.0); 02131 02132 code++; 02133 } 02134 02135 cluster_averageWildcodeScores[wildCode] = averageScore; 02136 } 02137 02138 return cluster_averageWildcodeScores[wildCode]; 02139 }
Here is the call graph for this function:

| int cluster_buildCluster | ( | struct wordLocation * | wordLocations, | |
| uint4 | numWordLocations, | |||
| struct sequence * | sequences | |||
| ) |
Definition at line 1291 of file cluster.c.
References cluster_addChild(), cluster_newParent(), cluster_parents, cluster_PSSM, cluster_score(), global_malloc(), int4, sequence::length, memBlocks_newEntry(), wordLocation::offset, sequence::regionStart, parent::sequence, sequence::sequence, wordLocation::sequenceNumber, uint4, and wildcardsThreshold.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01293 { 01294 uint4 *toCluster, numToCluster, *toCluster2, numToCluster2, *toCluster3; 01295 int4 start, end, column; 01296 int4 score, bestScore, bestLocation; 01297 struct sequence sequence; 01298 uint4 locationNum; 01299 struct parent* parent; 01300 struct wordLocation firstMember; 01301 01302 // printf("cluster_buildCluster["); fflush(stdout); 01303 01304 // Compare sequences to the PSSM 01305 bestScore = 0; bestLocation = 0; 01306 locationNum = 0; 01307 while (locationNum < numWordLocations) 01308 { 01309 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL) 01310 { 01311 // For each sequence 01312 sequence = sequences[wordLocations[locationNum].sequenceNumber]; 01313 01314 // Find start and end 01315 start = -wordLocations[locationNum].offset; 01316 end = sequence.length - wordLocations[locationNum].offset; 01317 01318 // Score sequence against PSSM 01319 score = 0; 01320 column = start; 01321 while (column < end) 01322 { 01323 score += cluster_PSSM[column][sequence.sequence[column - start]]; 01324 column++; 01325 } 01326 01327 // Check if this sequence is most similiar to PSSM 01328 score = 100 * score / sequence.length; 01329 if (score > bestScore) 01330 { 01331 bestLocation = locationNum; 01332 bestScore = score; 01333 } 01334 01335 // printf("[%3d] ", score); 01336 // cluster_printSequence(start - cluster_PSSMstart, sequence.sequence, sequence.length); 01337 } 01338 01339 locationNum++; 01340 } 01341 01342 // Closest to the consensus becomes the first member of the new cluster 01343 parent = (struct parent*)memBlocks_newEntry(cluster_parents); 01344 cluster_newParent(parent, sequences + wordLocations[bestLocation].sequenceNumber); 01345 firstMember = wordLocations[bestLocation]; 01346 01347 // Remove first member from PSSM 01348 sequence = sequences[wordLocations[bestLocation].sequenceNumber]; 01349 start = -wordLocations[bestLocation].offset; 01350 end = sequence.length - wordLocations[bestLocation].offset; 01351 column = start; 01352 while (column < end) 01353 { 01354 cluster_PSSM[column][sequence.sequence[column - start]]--; 01355 column++; 01356 } 01357 01358 toCluster = (uint4*)global_malloc(sizeof(uint4) * numWordLocations); 01359 toCluster2 = (uint4*)global_malloc(sizeof(uint4) * numWordLocations); 01360 numToCluster = 0; numToCluster2 = 0; 01361 01362 // Initially consider every sequence 01363 locationNum = 0; 01364 while (locationNum < numWordLocations) 01365 { 01366 // If not already a member of a cluster 01367 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL) 01368 { 01369 // Add to list to align to new cluster's parent 01370 toCluster[numToCluster] = locationNum; 01371 numToCluster++; 01372 } 01373 locationNum++; 01374 } 01375 01376 // printf("!"); fflush(stdout); 01377 01378 // Repeat until no more sequences are similiar enough to the parent 01379 while (numToCluster > 0) 01380 { 01381 // For each sequence still worth considering 01382 bestScore = 0; bestLocation = 0; 01383 // printf("%d to cluster\n", numToCluster); 01384 while (numToCluster > 0) 01385 { 01386 numToCluster--; 01387 locationNum = toCluster[numToCluster]; 01388 01389 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL) 01390 { 01391 // Get sequence, start and end 01392 sequence = sequences[wordLocations[locationNum].sequenceNumber]; 01393 start = -wordLocations[locationNum].offset; 01394 end = sequence.length - wordLocations[locationNum].offset; 01395 01396 // Calculate score for aligning sequence to parent of cluster 01397 score = cluster_score(sequences + firstMember.sequenceNumber, &sequence, 01398 firstMember.offset - wordLocations[locationNum].offset, 01399 wildcardsThreshold); 01400 01401 // If above cutoff 01402 if (score > 0) 01403 { 01404 // Add to list 01405 toCluster2[numToCluster2] = locationNum; 01406 numToCluster2++; 01407 01408 // Calculate highest scoring match 01409 if (score > bestScore) 01410 { 01411 bestScore = score; 01412 bestLocation = locationNum; 01413 } 01414 01415 // cluster_printSequence(start - cluster_PSSMstart, sequence.sequence, sequence.length); 01416 // printf("Score=%d\n", score); 01417 } 01418 } 01419 } 01420 01421 // If we have matches to our close-to-consensus sequence 01422 if (bestScore > 0) 01423 { 01424 // Add the closest match to the new cluster 01425 cluster_addChild(parent, sequences + wordLocations[bestLocation].sequenceNumber, 01426 firstMember.offset - wordLocations[bestLocation].offset + 01427 sequences[firstMember.sequenceNumber].regionStart); 01428 01429 // Remove new member from PSSM 01430 sequence = sequences[wordLocations[bestLocation].sequenceNumber]; 01431 start = -wordLocations[bestLocation].offset; 01432 end = sequence.length - wordLocations[bestLocation].offset; 01433 column = start; 01434 while (column < end) 01435 { 01436 cluster_PSSM[column][sequence.sequence[column - start]]--; 01437 column++; 01438 } 01439 01440 // Switch clusters so processing new list of close matches to parent 01441 toCluster3 = toCluster; 01442 toCluster = toCluster2; 01443 toCluster2 = toCluster3; 01444 numToCluster = numToCluster2; 01445 numToCluster2 = 0; 01446 } 01447 01448 // cluster_printCluster(parent); 01449 } 01450 01451 // cluster_printCluster(parent); 01452 01453 free(toCluster); 01454 free(toCluster2); 01455 01456 // printf("]\n"); fflush(stdout); 01457 01458 return 0; 01459 }
Here is the call graph for this function:

Here is the caller graph for this function:

| int cluster_buildPSSM | ( | struct wordLocation * | wordLocations, | |
| uint4 | numWordLocations, | |||
| struct sequence * | sequences | |||
| ) |
Definition at line 1206 of file cluster.c.
References cluster_PSSM, cluster_PSSMend, cluster_PSSMlength, cluster_PSSMstart, encoding_numCodes, global_malloc(), int4, sequence::length, wordLocation::offset, sequence::sequence, wordLocation::sequenceNumber, and uint4.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01208 { 01209 int4 start, end, column; 01210 uint4 locationNum; 01211 unsigned char code; 01212 struct sequence sequence; 01213 01214 cluster_PSSMstart = 0; 01215 cluster_PSSMend = 0; 01216 01217 // Calculate the start and end of consensus region 01218 locationNum = 0; 01219 while (locationNum < numWordLocations) 01220 { 01221 // Check sequences do not already have parents 01222 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL) 01223 { 01224 start = -wordLocations[locationNum].offset; 01225 if (start < cluster_PSSMstart) 01226 cluster_PSSMstart = -wordLocations[locationNum].offset; 01227 01228 end = sequences[wordLocations[locationNum].sequenceNumber].length 01229 - wordLocations[locationNum].offset; 01230 if (end > cluster_PSSMend) 01231 cluster_PSSMend = end; 01232 } 01233 01234 locationNum++; 01235 } 01236 01237 if (cluster_PSSMstart == 0 && cluster_PSSMend == 0) 01238 return 1; 01239 01240 // printf("consensus start=%d end=%d\n", cluster_PSSMstart, cluster_PSSMend); 01241 01242 // Initialize the PSSM 01243 cluster_PSSMlength = cluster_PSSMend - cluster_PSSMstart; 01244 cluster_PSSM = (uint4**)global_malloc(sizeof(uint4*) * cluster_PSSMlength); 01245 cluster_PSSM -= cluster_PSSMstart; 01246 column = cluster_PSSMstart; 01247 while (column < cluster_PSSMend) 01248 { 01249 // Initialize each column to zero values 01250 cluster_PSSM[column] = (uint4*)global_malloc(sizeof(uint4) * encoding_numCodes); 01251 code = 0; 01252 while (code < encoding_numCodes) 01253 { 01254 cluster_PSSM[column][code] = 0; 01255 code++; 01256 } 01257 column++; 01258 } 01259 01260 // Calculate residue frequencies at each column 01261 locationNum = 0; 01262 while (locationNum < numWordLocations) 01263 { 01264 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL) 01265 { 01266 // For each sequence 01267 sequence = sequences[wordLocations[locationNum].sequenceNumber]; 01268 01269 // Find start and end 01270 start = -wordLocations[locationNum].offset; 01271 end = sequence.length - wordLocations[locationNum].offset; 01272 01273 // Process each residue in the sequence and update count in PSSM 01274 column = 0; 01275 while (column < sequence.length) 01276 { 01277 // printf("[%d]", sequence.sequence[column]); fflush(stdout); 01278 cluster_PSSM[column + start][sequence.sequence[column]]++; 01279 column++; 01280 } 01281 } 01282 01283 locationNum++; 01284 } 01285 01286 return 0; 01287 }
Here is the call graph for this function:

Here is the caller graph for this function:

| int4 cluster_calculateSavings | ( | ) |
Definition at line 1533 of file cluster.c.
References parent::children, cluster_parents, encoding_aaStartWildcards, int4, sequence::length, parent::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), sequence::number, parent::numChildren, sequence::parent, and parent::sequence.
Referenced by main().
01534 { 01535 int4 count, childNum, numCharsSaved, bytesSaved = 0, numWilds; 01536 struct sequence* child; 01537 struct parent* parent; 01538 01539 // For each parent 01540 memBlocks_resetCurrent(cluster_parents); 01541 while ((parent = memBlocks_getCurrent(cluster_parents)) != NULL) 01542 { 01543 count = 0; numWilds = 0; 01544 while (count < parent->length) 01545 { 01546 if (parent->sequence[count] >= encoding_aaStartWildcards) 01547 numWilds++; 01548 count++; 01549 } 01550 01551 // If it has children 01552 if (parent->children != NULL) 01553 { 01554 // For each child calculate number of characters saved 01555 numCharsSaved = 0; 01556 childNum = 0; 01557 while (childNum < parent->numChildren) 01558 { 01559 child = parent->children[childNum]; 01560 01561 // CHECK: no child has two parents 01562 if (child->parent != parent) 01563 { 01564 printf("Error sequence %d has multiple parents\n", child->number); 01565 exit(-1); 01566 } 01567 01568 numCharsSaved += child->length; 01569 childNum++; 01570 } 01571 01572 // Minus cost of parent 01573 bytesSaved += numCharsSaved - parent->length; 01574 01575 if (numCharsSaved < parent->length) 01576 { 01577 printf("Error parent length %d with %d children saves negative bytes", 01578 parent->length, parent->numChildren); 01579 exit(-1); 01580 } 01581 } 01582 } 01583 01584 return bytesSaved; 01585 }
Here is the call graph for this function:

Here is the caller graph for this function:

| uint4 cluster_checkMerge | ( | struct wordLocation * | wordLocations1, | |
| uint4 | numWordLocations1, | |||
| struct wordLocation * | wordLocations2, | |||
| uint4 | numWordLocations2, | |||
| int4 * | bestDiagonal | |||
| ) |
| void cluster_clusterSequences | ( | struct sequence * | sequences, | |
| uint4 | numSequences, | |||
| uint4 | numberOfLetters, | |||
| char * | filename | |||
| ) |
Definition at line 870 of file cluster.c.
References close_index(), cluster_buildCluster(), cluster_buildPSSM(), cluster_freePSSM(), cluster_overlap(), cluster_processList(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), decoWordLength, listinfo::doc_count, listinfo::doc_numbers, get_next_biglist(), get_next_list(), global_malloc(), int4, diagonal::matchSequence, maxListLength, memSingleBlock_free(), memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_initializeExisting(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, diagonal::numMatches, wordLocation::offset, open_index(), percentMatchesRequired, listinfo::phrase_offsets, diagonal::relativeOffset, reset_index(), wordLocation::sequenceNumber, uint4, and wildcardsThreshold.
00872 { 00873 struct wordLocation* wordLocations; 00874 uint4 numWordLocations; 00875 int4 relativeOffset, sequenceNumber; 00876 uint4 sequence1, sequence2, score; 00877 struct memSingleBlock* diagonals, *sequenceDiagonals; 00878 struct diagonal* diagonalMatch; 00879 struct sequenceMatch *sequenceMatch; 00880 struct memSingleBlock* sequenceMatches; 00881 struct sequence* seq1, *seq2; 00882 struct index_scanner *index_scanner; 00883 struct listinfo* listinfo = NULL; 00884 uint4 offsetCount, sequenceCount; 00885 00886 // Initialize diagonals 00887 diagonals = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock) 00888 * numberOfSequences); 00889 sequenceNumber = 0; 00890 while (sequenceNumber < numberOfSequences) 00891 { 00892 memSingleBlock_initializeExisting(diagonals + sequenceNumber, sizeof(struct diagonal), 1); 00893 sequenceNumber++; 00894 } 00895 00896 printf("Initialized diagonals\n"); fflush(stdout); 00897 00898 // Temporary buffer for storing word locations 00899 wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences); 00900 00901 index_scanner = open_index(filename); 00902 00903 // First process the longest lists 00904 while ((listinfo = get_next_biglist(index_scanner, listinfo)) != NULL && 00905 listinfo->doc_count >= maxListLength) 00906 { 00907 printf("doc_count=%ld\n", listinfo->doc_count); fflush(stdout); 00908 00909 // For each sequence 00910 numWordLocations = 0; 00911 sequenceCount = 0; 00912 while (sequenceCount < listinfo->doc_count) 00913 { 00914 // Only consider first occurence in that sequence 00915 offsetCount = 0; 00916 00917 wordLocations[numWordLocations].sequenceNumber 00918 = listinfo->doc_numbers[sequenceCount] - 1; 00919 wordLocations[numWordLocations].offset 00920 = listinfo->phrase_offsets[sequenceCount][offsetCount] - 1; 00921 00922 numWordLocations++; 00923 offsetCount++; 00924 00925 sequenceCount++; 00926 } 00927 00928 printf("Was %d\n", numWordLocations); fflush(stdout); 00929 00930 // Remove any locations refering to already clustered sequences 00931 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences); 00932 00933 if (numWordLocations >= maxListLength && 00934 !cluster_buildPSSM(wordLocations, numWordLocations, sequences)) 00935 { 00936 while (numWordLocations >= maxListLength) 00937 { 00938 // Build cluster from longest list of word locations 00939 cluster_buildCluster(wordLocations, numWordLocations, sequences); 00940 00941 // cluster_printList(wordLocations, numWordLocations); 00942 00943 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences); 00944 printf("Now %d\n", numWordLocations); fflush(stdout); 00945 } 00946 00947 // Free the PSSM 00948 cluster_freePSSM(); 00949 } 00950 00951 // Process a list of word locations and insert/update diagonal entries 00952 cluster_processList(diagonals, wordLocations, numWordLocations); 00953 } 00954 00955 // Process remaining shorter lists 00956 index_scanner = reset_index(index_scanner); 00957 while ((listinfo = get_next_list(index_scanner, listinfo)) != NULL) 00958 { 00959 if (listinfo->doc_count < maxListLength) 00960 { 00961 // For each sequence 00962 numWordLocations = 0; 00963 sequenceCount = 0; 00964 while (sequenceCount < listinfo->doc_count) 00965 { 00966 // Only consider first occurence in that sequence 00967 offsetCount = 0; 00968 00969 wordLocations[numWordLocations].sequenceNumber 00970 = listinfo->doc_numbers[sequenceCount] - 1; 00971 wordLocations[numWordLocations].offset 00972 = listinfo->phrase_offsets[sequenceCount][offsetCount] - 1; 00973 00974 offsetCount++; 00975 numWordLocations++; 00976 00977 sequenceCount++; 00978 } 00979 00980 // printf("cluster_processList with %d entries\n", numWordLocations); 00981 // Process a list of word locations and insert/update diagonal entries 00982 cluster_processList(diagonals, wordLocations, numWordLocations); 00983 } 00984 } 00985 close_index(index_scanner); 00986 00987 free(wordLocations); 00988 00989 printf("END STAGE 2\n"); fflush(stdout); 00990 00991 sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences); 00992 00993 printf("\n"); 00994 // For each sequence 00995 sequenceNumber = 0; 00996 while (sequenceNumber < numberOfSequences) 00997 { 00998 // Go through list of matching diagonals 00999 sequenceDiagonals = diagonals + sequenceNumber; 01000 memSingleBlock_resetCurrent(sequenceDiagonals); 01001 while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL) 01002 { 01003 sequence1 = sequenceNumber; 01004 sequence2 = diagonalMatch->matchSequence; 01005 relativeOffset = diagonalMatch->relativeOffset; 01006 seq1 = sequences + sequence1; 01007 seq2 = sequences + sequence2; 01008 01009 // Calculate rough score based on number of matches, length of overlap region 01010 score = 100 * diagonalMatch->numMatches / 01011 cluster_overlap(seq1, seq2, relativeOffset) + decoWordLength; 01012 01013 if (score > percentMatchesRequired)// && relativeOffset == 0 && seq1->length == seq2->length) 01014 { 01015 // Compare the two sequences in detail 01016 score = cluster_score(seq1, seq2, relativeOffset, wildcardsThreshold); 01017 if (score > 0) 01018 { 01019 // printf("Sequences %d,%d lengths %d,%d offset %d matches=%d score=%d\n", 01020 // sequence1, sequence2, sequences[sequence1].length, sequences[sequence2].length, 01021 // relativeOffset, diagonalMatch->numMatches, score); 01022 01023 // Record high-scoring match between pair of sequences 01024 sequenceMatch = memSingleBlock_newEntry(sequenceMatches); 01025 sequenceMatch->sequence1 = sequence1; 01026 sequenceMatch->sequence2 = sequence2; 01027 sequenceMatch->relativeOffset = relativeOffset; 01028 sequenceMatch->score = score; 01029 } 01030 } 01031 01032 } 01033 sequenceNumber++; 01034 } 01035 01036 printf("END STAGE 3 numSequenceMatches=%d\n", sequenceMatches->numEntries); fflush(stdout); 01037 01038 // Free memory used to store diagonal matches 01039 sequenceNumber = 0; 01040 while (sequenceNumber < numberOfSequences) 01041 { 01042 free(diagonals[sequenceNumber].block); 01043 sequenceNumber++; 01044 } 01045 free(diagonals); 01046 01047 // Process sequence matches 01048 cluster_processSequenceMatches(sequenceMatches, sequences); 01049 01050 memSingleBlock_free(sequenceMatches); 01051 }
Here is the call graph for this function:

| int4 cluster_compareScore | ( | const void * | sequenceMatch1, | |
| const void * | sequenceMatch2 | |||
| ) |
Definition at line 1885 of file cluster.c.
References sequenceMatch::score.
Referenced by cluster_processSequenceMatches().
01886 { 01887 const struct sequenceMatch *a1, *a2; 01888 01889 a1 = (struct sequenceMatch*)sequenceMatch1; 01890 a2 = (struct sequenceMatch*)sequenceMatch2; 01891 01892 if (a1->score > a2->score) 01893 { 01894 return -1; 01895 } 01896 else if (a1->score < a2->score) 01897 { 01898 return 1; 01899 } 01900 else 01901 { 01902 return 0; 01903 } 01904 }
Here is the caller graph for this function:

| int4 cluster_compareSequenceLengths | ( | const void * | sequence1, | |
| const void * | sequence2 | |||
| ) |
Definition at line 432 of file cluster.c.
References sequence::length.
Referenced by cluster_simpleClusterSequences().
00433 { 00434 const struct sequence *a1, *a2; 00435 00436 a1 = (struct sequence*)sequence1; 00437 a2 = (struct sequence*)sequence2; 00438 00439 if (a1->length > a2->length) 00440 { 00441 return 1; 00442 } 00443 else if (a1->length < a2->length) 00444 { 00445 return -1; 00446 } 00447 else 00448 { 00449 return 0; 00450 } 00451 }
Here is the caller graph for this function:

| void cluster_freePSSM | ( | ) |
Definition at line 1462 of file cluster.c.
References cluster_PSSM, cluster_PSSMend, cluster_PSSMstart, and int4.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01463 { 01464 int4 column; 01465 01466 column = cluster_PSSMstart; 01467 while (column < cluster_PSSMend) 01468 { 01469 free(cluster_PSSM[column]); 01470 column++; 01471 } 01472 cluster_PSSM += cluster_PSSMstart; 01473 free(cluster_PSSM); 01474 }
Here is the caller graph for this function:

| struct edit * cluster_getEdits | ( | struct parent * | parent, | |
| struct sequence * | child, | |||
| uint4 * | numEdits | |||
| ) | [read] |
Definition at line 409 of file cluster.c.
References edit::code, global_malloc(), sequence::length, edit::position, sequence::regionStart, sequence::sequence, parent::sequence, and uint4.
Referenced by cluster_writeClusters().
00410 { 00411 uint4 count = 0; 00412 struct edit* edits; 00413 00414 *numEdits = 0; 00415 edits = (struct edit*)global_malloc(sizeof(struct edit) * child->length); 00416 00417 while (count < child->length) 00418 { 00419 if (parent->sequence[count + child->regionStart] != child->sequence[count]) 00420 { 00421 edits[*numEdits].position = count; 00422 edits[*numEdits].code = child->sequence[count]; 00423 (*numEdits)++; 00424 } 00425 count++; 00426 } 00427 00428 return edits; 00429 }
Here is the call graph for this function:

Here is the caller graph for this function:

| uint4 cluster_getWildcode | ( | struct parent * | parent, | |
| uint4 | position | |||
| ) |
Definition at line 1676 of file cluster.c.
References parent::number, sequence::parent, wildCode::position, uint4, wildCode::wildCode, and parent::wildCodes.
Referenced by cluster_addChild(), and cluster_score().
01677 { 01678 uint4 count = 0; 01679 01680 while (count < parent->numWildcards) 01681 { 01682 if (parent->wildCodes[count].position == position) 01683 return parent->wildCodes[count].wildCode; 01684 count++; 01685 } 01686 01687 fprintf(stderr, "Internal error cluster_getWildcode(%d, %d)\n", parent->number, position); 01688 exit(-1); 01689 }
Here is the caller graph for this function:

| struct wordLocation* cluster_mergeLists | ( | struct wordLocation * | wordLocations1, | |
| uint4 * | numWordLocations1, | |||
| struct wordLocation * | wordLocations2, | |||
| uint4 * | numWordLocations2, | |||
| int4 | mergeRelativeOffset, | |||
| uint * | lengthNewList | |||
| ) | [read] |
Definition at line 1717 of file cluster.c.
References parent::children, cluster_addChild(), cluster_updateWildcode(), global_realloc(), int4, parent::length, parent::numChildren, parent::numWildcards, wildCode::position, sequence::regionStart, parent::sequence, uint4, wildCode::wildCode, and parent::wildCodes.
Referenced by cluster_processSequenceMatches().
01718 { 01719 int4 childNum; 01720 struct sequence* child; 01721 struct parent* temp; 01722 uint4 wildcodeCount = 0; 01723 01724 // Arrange so that parent1 comes before parent2 01725 if (relativeOffset < 0) 01726 { 01727 temp = parent1; 01728 parent1 = parent2; 01729 parent2 = temp; 01730 relativeOffset = -relativeOffset; 01731 } 01732 01733 // If parent2 is longer than parent1 01734 if (parent2->length + relativeOffset > parent1->length) 01735 { 01736 // Add end of parent2 onto parent1 01737 parent1->sequence = (char*)global_realloc(parent1->sequence, parent2->length + relativeOffset); 01738 memcpy(parent1->sequence + parent1->length, parent2->sequence + parent1->length - relativeOffset, 01739 parent2->length - parent1->length + relativeOffset); 01740 parent1->length = parent2->length + relativeOffset; 01741 01742 while (wildcodeCount < parent2->numWildcards) 01743 { 01744 cluster_updateWildcode(parent1, parent2->wildCodes[wildcodeCount].position + relativeOffset, 01745 parent2->wildCodes[wildcodeCount].wildCode); 01746 wildcodeCount++; 01747 } 01748 } 01749 01750 // For each child in parent2 01751 childNum = 0; 01752 while (childNum < parent2->numChildren) 01753 { 01754 // Add child to parent1 01755 child = parent2->children[childNum]; 01756 cluster_addChild(parent1, child, relativeOffset + child->regionStart); 01757 01758 childNum++; 01759 } 01760 01761 // parent2 no longer has children 01762 free(parent2->children); 01763 parent2->children = NULL; 01764 free(parent2->sequence); 01765 parent2->sequence = NULL; 01766 parent2->length = 0; 01767 parent2->numChildren = 0; 01768 }
Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 1644 of file cluster.c.
References parent::children, cluster_numParents, global_malloc(), sequence::length, parent::length, parent::number, parent::numChildren, parent::numWildcards, sequence::parent, sequence::regionStart, sequence::sequence, parent::sequence, and parent::wildCodes.
Referenced by cluster_buildCluster(), and cluster_processSequenceMatches().
01645 { 01646 // Make copy of the child 01647 parent->number = cluster_numParents; cluster_numParents++; 01648 parent->length = child->length; 01649 parent->sequence = (char*)global_malloc(parent->length); 01650 memcpy(parent->sequence, child->sequence, child->length); 01651 01652 // Initialize list of wildcards 01653 parent->wildCodes = NULL; 01654 parent->numWildcards = 0; 01655 01656 // Link parent to child, and vice versa 01657 parent->numChildren = 1; 01658 parent->children = (struct sequence**)global_malloc(sizeof(struct sequence*)); 01659 parent->children[0] = child; 01660 child->regionStart = 0; 01661 child->parent = parent; 01662 }
Here is the call graph for this function:

Here is the caller graph for this function:

| uint4 cluster_overlap | ( | struct sequence * | sequence1, | |
| struct sequence * | sequence2, | |||
| int4 | relativeOffset | |||
| ) |
Definition at line 1624 of file cluster.c.
References sequence::length, and uint4.
Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().
01625 { 01626 uint4 start1, end1; 01627 01628 // Find start of match region 01629 if (relativeOffset > 0) 01630 start1 = relativeOffset; 01631 else 01632 start1 = 0; 01633 01634 // Find end of match region 01635 if (sequence1->length < sequence2->length + relativeOffset) 01636 end1 = sequence1->length; 01637 else 01638 end1 = sequence2->length + relativeOffset; 01639 01640 return end1 - start1; 01641 }
Here is the caller graph for this function:

| void cluster_printCluster | ( | struct parent * | parent | ) |
Definition at line 1588 of file cluster.c.
References parent::children, int4, sequence::length, parent::length, sequence::number, parent::number, parent::numChildren, sequence::parent, print_singleSequence(), sequence::regionStart, sequence::sequence, and parent::sequence.
01589 { 01590 int4 count, childNum, numCharsSaved; 01591 struct sequence* child; 01592 01593 printf("### Parent %d with %d children ###\nParent ", parent->number, parent->numChildren); 01594 print_singleSequence(parent->sequence, parent->length); printf("\n"); 01595 01596 // For each child 01597 numCharsSaved = 0; 01598 childNum = 0; 01599 while (childNum < parent->numChildren) 01600 { 01601 child = parent->children[childNum]; 01602 01603 printf("[%7d] ", child->number); 01604 // Print spaces to align child to parent 01605 count = 0; 01606 while (count < child->regionStart) 01607 { 01608 printf(" "); 01609 count++; 01610 } 01611 01612 // Print child 01613 print_singleSequence(child->sequence, child->length); printf("\n"); 01614 // printf("Child Length=%d start=%d\n", child->length, child->regionStart); 01615 01616 numCharsSaved += child->length; 01617 childNum++; 01618 } 01619 printf("%d bytes saved\n", numCharsSaved - parent->length); 01620 printf("\n"); 01621 }
Here is the call graph for this function:

| void cluster_printList | ( | struct wordLocation * | list, | |
| uint4 | length | |||
| ) |
Definition at line 1520 of file cluster.c.
References wordLocation::offset, wordLocation::sequenceNumber, and uint4.
01521 { 01522 uint4 count = 0; 01523 01524 while (count < length) 01525 { 01526 printf("[%d,%d] ", list[count].sequenceNumber, list[count].offset); 01527 count++; 01528 } 01529 printf(".\n"); 01530 }
| void cluster_printSequence | ( | uint4 | whitespace, | |
| unsigned char * | sequence, | |||
| uint4 | length | |||
| ) |
Definition at line 1507 of file cluster.c.
References print_singleSequence().
01508 { 01509 // Print whitespace before sequence begins 01510 while (whitespace > 0) 01511 { 01512 printf(" "); 01513 whitespace--; 01514 } 01515 01516 print_singleSequence(sequence, length); printf("\n"); 01517 }
Here is the call graph for this function:

| void cluster_processList | ( | struct memSingleBlock * | diagonals, | |
| struct wordLocation * | wordLocations, | |||
| uint4 | numWordLocations | |||
| ) |
Definition at line 1133 of file cluster.c.
References int4, diagonal::matchSequence, memSingleBlock_getCurrent(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), diagonal::numMatches, wordLocation::offset, diagonal::relativeOffset, wordLocation::sequenceNumber, and uint4.
Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().
01135 { 01136 uint4 count1, count2, numNewDiagonalMatches = 0; 01137 struct wordLocation location1, location2; 01138 int4 relativeOffset; 01139 char matchFound; 01140 struct memSingleBlock *sequenceDiagonals; 01141 struct diagonal* diagonalMatch; 01142 01143 // For each pair of locations 01144 numNewDiagonalMatches = 0; 01145 count1 = 0; 01146 while (count1 < numWordLocations) 01147 { 01148 count2 = count1 + 1; 01149 while (count2 < numWordLocations) 01150 { 01151 // location1 < location2 01152 if (wordLocations[count1].sequenceNumber < wordLocations[count2].sequenceNumber) 01153 { 01154 location1 = wordLocations[count1]; 01155 location2 = wordLocations[count2]; 01156 } 01157 else 01158 { 01159 location2 = wordLocations[count1]; 01160 location1 = wordLocations[count2]; 01161 } 01162 01163 // If the locations don't both refer to the same sequence 01164 if (location1.sequenceNumber != location2.sequenceNumber) 01165 { 01166 // Search list of matching diagonals for this sequence for one that is the 01167 // same diagonal and matching sequence as the new match 01168 relativeOffset = location1.offset - location2.offset; 01169 matchFound = 0; 01170 sequenceDiagonals = diagonals + location1.sequenceNumber; 01171 memSingleBlock_resetCurrent(sequenceDiagonals); 01172 while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL) 01173 { 01174 if (diagonalMatch->relativeOffset == relativeOffset && 01175 diagonalMatch->matchSequence == location2.sequenceNumber) 01176 { 01177 // If we found an identinal entry, increment number of matches 01178 matchFound = 1; 01179 diagonalMatch->numMatches++; 01180 break; 01181 } 01182 } 01183 01184 // Otherwise create a new entry, ONLY if neither sequence1 or 2 belong to a cluster 01185 if (!matchFound)// && sequences[wordLocations[count1].sequenceNumber].parent == NULL 01186 // && sequences[wordLocations[count2].sequenceNumber].parent == NULL) 01187 { 01188 diagonalMatch = memSingleBlock_newEntry(sequenceDiagonals); 01189 diagonalMatch->relativeOffset = relativeOffset; 01190 diagonalMatch->matchSequence = location2.sequenceNumber; 01191 diagonalMatch->numMatches = 1; 01192 numNewDiagonalMatches++; 01193 } 01194 } 01195 01196 count2++; 01197 } 01198 count1++; 01199 } 01200 01201 // printf("listLength=%d numNewDiagonalMatches=%d\n", numWordLocations, 01202 // numNewDiagonalMatches); 01203 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void cluster_processSequenceMatches | ( | struct memSingleBlock * | sequenceMatches, | |
| struct sequence * | sequences | |||
| ) |
Definition at line 1054 of file cluster.c.
References memSingleBlock::block, cluster_addChild(), cluster_compareScore(), cluster_mergeParents(), cluster_newParent(), cluster_parents, cluster_score(), int4, memBlocks_newEntry(), memSingleBlock_getCurrent(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, sequence::parent, sequence::regionStart, and wildcardsThreshold.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01055 { 01056 struct sequenceMatch *sequenceMatch; 01057 struct sequence *seq1, *seq2; 01058 struct parent* parent; 01059 int4 score, temp; 01060 01061 printf("Performing hierarchical clustering..."); fflush(stdout); 01062 01063 // Sort the matches by score 01064 qsort(sequenceMatches->block, sequenceMatches->numEntries, sizeof(struct sequenceMatch), 01065 cluster_compareScore); 01066 01067 // For each match 01068 memSingleBlock_resetCurrent(sequenceMatches); 01069 while ((sequenceMatch = memSingleBlock_getCurrent(sequenceMatches)) != NULL) 01070 { 01071 // If second sequence is a parent, reverse sequences 01072 if (sequences[sequenceMatch->sequence2].parent != NULL) 01073 { 01074 temp = sequenceMatch->sequence1; 01075 sequenceMatch->sequence1 = sequenceMatch->sequence2; 01076 sequenceMatch->sequence2 = temp; 01077 01078 sequenceMatch->relativeOffset *= -1; 01079 } 01080 01081 // printf("sequence match %d,%d\n", sequenceMatch->sequence1, sequenceMatch->sequence2); 01082 01083 seq1 = sequences + sequenceMatch->sequence1; 01084 seq2 = sequences + sequenceMatch->sequence2; 01085 01086 //printf("[%d,%d] ", seq1->number, seq2->number); 01087 // Case 1 - neither sequence has a parent 01088 if (seq1->parent == NULL && seq2->parent == NULL) 01089 { 01090 // printf("New cluster\n"); fflush(stdout); 01091 01092 // printf("New parent\n"); 01093 parent = (struct parent*)memBlocks_newEntry(cluster_parents); 01094 cluster_newParent(parent, seq1); 01095 cluster_addChild(parent, seq2, sequenceMatch->relativeOffset); 01096 } 01097 // Case 2 - sequence1 has a parent, sequence2 does not 01098 else if (seq1->parent != NULL && seq2->parent == NULL) 01099 { 01100 // printf("Add to %d\n", seq1->parent->numChildren); fflush(stdout); 01101 01102 // Check score for aligning sequence to parent 01103 sequenceMatch->relativeOffset += seq1->regionStart; 01104 score = cluster_score(seq1, seq2, sequenceMatch->relativeOffset, wildcardsThreshold); 01105 // printf("[%d][%d,%d,%d]\n", score, seq1->parent->length, seq2->length, sequenceMatch->relativeOffset); 01106 01107 if (score > 0) 01108 { 01109 // printf("Add to existing\n"); 01110 cluster_addChild(seq1->parent, seq2, sequenceMatch->relativeOffset); 01111 // printf("!"); fflush(stdout); 01112 } 01113 } 01114 // Case 3 - both sequences have parents 01115 else if (seq1->parent != seq2->parent) 01116 { 01117 // printf("Merge %d and %d\n", seq1->parent->numChildren, seq2->parent->numChildren); fflush(stdout); 01118 01119 sequenceMatch->relativeOffset += seq1->regionStart; 01120 sequenceMatch->relativeOffset -= seq2->regionStart; 01121 score = cluster_score(seq1, seq2, sequenceMatch->relativeOffset, wildcardsThreshold); 01122 if (score > 0) 01123 { 01124 cluster_mergeParents(seq1->parent, seq2->parent, sequenceMatch->relativeOffset); 01125 } 01126 } 01127 } 01128 01129 printf("done.\n"); fflush(stdout); 01130 }
Here is the call graph for this function:

Here is the caller graph for this function:

| uint4 cluster_removeChildren | ( | struct wordLocation * | wordLocations, | |
| uint4 | numWordLocations, | |||
| struct sequence * | sequences | |||
| ) |
Definition at line 1478 of file cluster.c.
References global_malloc(), sequence::parent, sequence::sequence, wordLocation::sequenceNumber, and uint4.
Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01480 { 01481 struct wordLocation *wordLocations2; 01482 uint4 numWordLocations2, locationNum; 01483 struct sequence sequence; 01484 01485 wordLocations2 = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numWordLocations); 01486 numWordLocations2 = 0; 01487 locationNum = 0; 01488 while (locationNum < numWordLocations) 01489 { 01490 // If not a member of a cluster 01491 sequence = sequences[wordLocations[locationNum].sequenceNumber]; 01492 if (sequence.parent == NULL) 01493 { 01494 // Add to new list of word locations 01495 wordLocations2[numWordLocations2] = wordLocations[locationNum]; 01496 numWordLocations2++; 01497 } 01498 locationNum++; 01499 } 01500 01501 // Return new pruned list of word locations 01502 memcpy(wordLocations, wordLocations2, sizeof(struct wordLocation) * numWordLocations2); 01503 free(wordLocations2); 01504 return numWordLocations2; 01505 }
Here is the call graph for this function:

Here is the caller graph for this function:

| uint4 cluster_score | ( | struct sequence * | seq1, | |
| struct sequence * | seq2, | |||
| int4 | relativeOffset, | |||
| float | wildscoreAllowed | |||
| ) |
Definition at line 1907 of file cluster.c.
References clusterWildcard::averageScore, cluster_averageMatchScore, cluster_getWildcode(), encoding_aaStartWildcards, getbit, int4, parent::length, sequence::length, sequence::parent, sequence::regionStart, parent::sequence, sequence::sequence, sequenceMatch::sequence1, sequenceMatch::sequence2, setbit, uint4, wildcards_clusterWildcards, wildcards_numClusterWildcards, wildcardSumWindow, and clusterWildcard::wildCode.
Referenced by cluster_buildCluster(), cluster_clusterSequences(), cluster_processSequenceMatches(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().
01909 { 01910 int4 pos1, pos2, start1, end1, start2, end2; 01911 float sumWildscores = 0, windowWildscoreAllowed, sumWindowWildscores = 0; 01912 uint4 wildCode, wildcardCount; 01913 unsigned char* sequence1, *sequence2; 01914 int4 length1, length2; 01915 struct parent* parent1 = NULL, *parent2 = NULL; 01916 01917 // Check if sequence1 has a parent 01918 if (seq1->parent == NULL) 01919 { 01920 sequence1 = seq1->sequence; 01921 length1 = seq1->length; 01922 } 01923 else 01924 { 01925 parent1 = seq1->parent; 01926 sequence1 = seq1->parent->sequence; 01927 length1 = seq1->parent->length; 01928 relativeOffset += seq1->regionStart; 01929 } 01930 01931 // Check if sequence2 has a parent 01932 if (seq2->parent == NULL) 01933 { 01934 sequence2 = seq2->sequence; 01935 length2 = seq2->length; 01936 } 01937 else 01938 { 01939 parent2 = seq2->parent; 01940 sequence2 = seq2->parent->sequence; 01941 length2 = seq2->parent->length; 01942 relativeOffset -= seq2->regionStart; 01943 } 01944 01945 // Find start of match region 01946 if (relativeOffset > 0) 01947 { 01948 start1 = relativeOffset; 01949 start2 = 0; 01950 } 01951 else 01952 { 01953 start1 = 0; 01954 start2 = -relativeOffset; 01955 } 01956 01957 // Find end of match region 01958 if (length1 < length2 + relativeOffset) 01959 { 01960 end1 = length1; 01961 end2 = length1 - relativeOffset; 01962 } 01963 else 01964 { 01965 end1 = length2 + relativeOffset; 01966 end2 = length2; 01967 } 01968 01969 // Parents do not overlap, return zero 01970 if (relativeOffset > length1 || -relativeOffset > length2) 01971 { 01972 // printf("ro=%d\n", relativeOffset); 01973 // printf("[%d,%d,%d]\n", start1, end1, length1); 01974 // printf("[%d,%d,%d]\n", start2, end2, length2); 01975 return 0; 01976 } 01977 01978 pos1 = start1; 01979 pos2 = start2; 01980 01981 windowWildscoreAllowed = wildscoreAllowed * wildcardSumWindow; 01982 wildscoreAllowed *= (end1 - start1); 01983 01984 // For each position where the two sequences overlap 01985 while (pos1 < end1) 01986 { 01987 // If the sequences differ 01988 if (sequence1[pos1] != sequence2[pos2]) 01989 { 01990 // Build a wildcode for this position 01991 wildCode = 0; 01992 01993 // printf("%c != %c p1=%p p2=%p\n", encoding_getLetter(sequence1[pos1]), 01994 // encoding_getLetter(sequence2[pos2]), parent1, parent2); 01995 01996 // Sequence1 char is included in parent2's wildcard 01997 if (parent1 == NULL && parent2 != NULL && 01998 sequence2[pos2] >= encoding_aaStartWildcards && 01999 getbit(wildcards_clusterWildcards[sequence2[pos2] - 02000 encoding_aaStartWildcards].wildCode, sequence1[pos1])) 02001 { 02002 wildCode = wildcards_clusterWildcards[sequence2[pos2] - 02003 encoding_aaStartWildcards].wildCode; 02004 } 02005 // Sequence2 char is included in parent1's wildcard 02006 else if (parent2 == NULL && parent1 != NULL && 02007 sequence1[pos1] >= encoding_aaStartWildcards && 02008 getbit(wildcards_clusterWildcards[sequence1[pos1] - 02009 encoding_aaStartWildcards].wildCode, sequence2[pos2])) 02010 { 02011 wildCode = wildcards_clusterWildcards[sequence1[pos1] - 02012 encoding_aaStartWildcards].wildCode; 02013 } 02014 else 02015 { 02016 // If we are processing a single sequence (1) 02017 if (parent1 == NULL || sequence1[pos1] < encoding_aaStartWildcards) 02018 { 02019 setbit(wildCode, sequence1[pos1]); 02020 } 02021 // Else we are processing a parent (1) 02022 else 02023 { 02024 wildCode |= cluster_getWildcode(parent1, pos1); 02025 } 02026 02027 // If we a processing a single sequence (2) 02028 if (parent2 == NULL || sequence2[pos2] < encoding_aaStartWildcards) 02029 { 02030 setbit(wildCode, sequence2[pos2]); 02031 } 02032 // Else we are processing a parent (2) 02033 else 02034 { 02035 wildCode |= cluster_getWildcode(parent2, pos2); 02036 } 02037 } 02038 02039 //wildCodeAverageScore = cluster_averageWildcodeScore(wildCode); 02040 //wildCodeAverageScore = 0; 02041 //wildcards_printWildcard(wildCode); 02042 //printf("Average score=%f\n", wildCodeAverageScore); 02043 02044 // Check for a match in set of wildcards 02045 wildcardCount = 0; 02046 while (wildcardCount < wildcards_numClusterWildcards) 02047 { 02048 if ((wildcards_clusterWildcards[wildcardCount].wildCode & wildCode) == wildCode) 02049 { 02050 // printf("Match: "); 02051 // wildcards_printWildcard(wildcards_clusterWildcards[wildcardCount].wildCode); 02052 break; 02053 } 02054 wildcardCount++; 02055 } 02056 02057 sumWildscores += wildcards_clusterWildcards[wildcardCount].averageScore 02058 - cluster_averageMatchScore; 02059 sumWindowWildscores += wildcards_clusterWildcards[wildcardCount].averageScore 02060 - cluster_averageMatchScore; 02061 02062 /* // Add wild's average score to tally 02063 if (wildcards_clusterWildcards[wildcardCount].averageScore > 0) 02064 { 02065 sumWildscores += wildcards_clusterWildcards[wildcardCount].averageScore; 02066 sumWindowWildscores += wildcards_clusterWildcards[wildcardCount].averageScore; 02067 }*/ 02068 02069 // Stop when more than allowed total average score of wilds 02070 if (sumWildscores > wildscoreAllowed) 02071 return 0; 02072 02073 if (sumWindowWildscores > windowWildscoreAllowed) 02074 return 0; 02075 } 02076 02077 // Reset the window sum at the end of each window 02078 if (pos1 % wildcardSumWindow == 0) 02079 sumWindowWildscores = 0; 02080 02081 pos1++; 02082 pos2++; 02083 } 02084 02085 // Return score 02086 // return (end1 - start1) * (wildscoreAllowed - sumWildscores) / wildscoreAllowed; 02087 return 2000 - (1000 * sumWildscores / (end1 - start1)); 02088 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void cluster_simpleClusterSequences | ( | struct sequence * | sequences, | |
| uint4 | numberOfSequences, | |||
| uint4 | numberOfLetters | |||
| ) |
Definition at line 767 of file cluster.c.
References cluster_buildCluster(), cluster_buildPSSM(), cluster_compareSequenceLengths(), cluster_freePSSM(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), global_malloc(), sequence::length, maxListLength, memSingleBlock_free(), memSingleBlock_initialize(), memSingleBlock_newEntry(), wordLocation::offset, wordLocation::sequenceNumber, uint4, and wildcardsThreshold.
00769 { 00770 uint4 sequenceLength = 0, sequenceCount, startGroup, endGroup, numWordLocations; 00771 struct sequence* sequence, *seq1, *seq2; 00772 struct wordLocation* wordLocations; 00773 uint4 count1, count2, score; 00774 struct memSingleBlock* sequenceMatches; 00775 struct sequenceMatch *sequenceMatch; 00776 00777 // Temporary buffer for storing word locations 00778 wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences); 00779 00780 // Sort sequences by length 00781 qsort(sequences, numberOfSequences, sizeof(struct sequence), 00782 cluster_compareSequenceLengths); 00783 00784 sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences); 00785 00786 // For each sequence in order of length 00787 sequenceCount = 0; 00788 while (sequenceCount < numberOfSequences) 00789 { 00790 sequence = sequences + sequenceCount; 00791 sequenceLength = sequence->length; 00792 startGroup = sequenceCount; 00793 00794 // Find end of group of sequences with same length 00795 while (sequenceCount < numberOfSequences) 00796 { 00797 sequence = sequences + sequenceCount; 00798 00799 if (sequence->length != sequenceLength) 00800 { 00801 break; 00802 } 00803 sequenceCount++; 00804 } 00805 endGroup = sequenceCount; 00806 00807 numWordLocations = 0; 00808 while (numWordLocations < endGroup - startGroup) 00809 { 00810 wordLocations[numWordLocations].sequenceNumber = startGroup + numWordLocations; 00811 wordLocations[numWordLocations].offset = 0; 00812 numWordLocations++; 00813 } 00814 00815 if (numWordLocations >= maxListLength && 00816 !cluster_buildPSSM(wordLocations, numWordLocations, sequences)) 00817 { 00818 while (numWordLocations >= maxListLength) 00819 { 00820 // Build cluster from longest list of word locations 00821 cluster_buildCluster(wordLocations, numWordLocations, sequences); 00822 00823 // cluster_printList(wordLocations, numWordLocations); 00824 00825 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences); 00826 // printf("Now %d\n", numWordLocations); fflush(stdout); 00827 } 00828 00829 // Free the PSSM 00830 cluster_freePSSM(); 00831 } 00832 00833 // For each pair of remaining sequences 00834 count1 = 0; 00835 while (count1 < numWordLocations) 00836 { 00837 count2 = count1 + 1; 00838 while (count2 < numWordLocations) 00839 { 00840 seq1 = sequences + wordLocations[count1].sequenceNumber; 00841 seq2 = sequences + wordLocations[count2].sequenceNumber; 00842 00843 // Compare the two sequences in detail 00844 score = cluster_score(seq1, seq2, 0, wildcardsThreshold); 00845 00846 if (score > 0) 00847 { 00848 // Record high-scoring match between pair of sequences 00849 sequenceMatch = memSingleBlock_newEntry(sequenceMatches); 00850 sequenceMatch->sequence1 = wordLocations[count1].sequenceNumber; 00851 sequenceMatch->sequence2 = wordLocations[count2].sequenceNumber; 00852 sequenceMatch->relativeOffset = 0; 00853 sequenceMatch->score = score; 00854 } 00855 00856 count2++; 00857 } 00858 count1++; 00859 } 00860 printf("length=%d num=%d\n", sequence->length, endGroup - startGroup); 00861 } 00862 00863 // Process sequence matches 00864 cluster_processSequenceMatches(sequenceMatches, sequences); 00865 00866 memSingleBlock_free(sequenceMatches); 00867 }
Here is the call graph for this function:

| void cluster_spexClusterSequences | ( | struct sequence * | sequences, | |
| uint4 | numberOfSequences, | |||
| uint4 | numberOfLetters | |||
| ) |
Definition at line 453 of file cluster.c.
References memSingleBlock::block, cluster_buildCluster(), cluster_buildPSSM(), cluster_freePSSM(), cluster_overlap(), cluster_processList(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), decoQantum, decoWordLength, global_malloc(), hashcounter_count(), hashcounter_free(), hashcounter_hash, hashcounter_insert(), hashcounter_multiple(), hashcounter_new(), hashcounter_single(), int4, sequence::length, diagonal::matchSequence, maxListLength, memSingleBlock_free(), memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_initializeExisting(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, numHashtablePasses, numIterations, diagonal::numMatches, percentMatchesRequired, postings_addEntry(), postings_decodeList(), postings_getSortedLists(), postings_initialize(), postings_numLists, diagonal::relativeOffset, sequence::sequence, uint4, and wildcardsThreshold.
Referenced by main().
00455 { 00456 Hashcounter *hashCounter1 = NULL, *hashCounter2 = NULL; 00457 uint4 words, duplicates, iteration = 0, seed, wordLength; 00458 uint4 sequenceNumber, sequenceSize, offset, suboffset; 00459 unsigned char* sequence, *word; 00460 uint4 numHashtablePasses = 50, hashtablePass = 0, passhash; 00461 uint4 hashCounterSize = 1, sinceChange, submatches; 00462 int4 key, numWordLocations; 00463 struct wordLocation* wordLocations; 00464 int4 relativeOffset; 00465 uint4 sequence1, sequence2, score; 00466 struct memSingleBlock* diagonals, *sequenceDiagonals; 00467 struct diagonal* diagonalMatch; 00468 struct sequenceMatch *sequenceMatch; 00469 struct memSingleBlock* sequenceMatches; 00470 struct sequence* seq1, *seq2; 00471 int4 time; 00472 struct finalList* finalLists; 00473 uint4 hashValue1, hashValue2, position, lookup; 00474 00475 // Calculate size of hashcounter 00476 while (pow(2,hashCounterSize) < numberOfLetters / decoQantum) 00477 { 00478 hashCounterSize++; 00479 } 00480 hashCounter2 = hashcounter_new(hashCounterSize, 0); 00481 00482 wordLength = decoWordLength - ((numIterations - 1) * decoQantum); 00483 00484 while (iteration < numIterations) 00485 { 00486 time = -clock(); 00487 words = 0; duplicates = 0; 00488 sinceChange = 0; 00489 00490 // For each sequence 00491 sequenceNumber = 0; 00492 while (sequenceNumber < numberOfSequences) 00493 { 00494 sequence = sequences[sequenceNumber].sequence; 00495 sequenceSize = sequences[sequenceNumber].length; 00496 00497 // For each sequence longer enough that hasn't been excluded 00498 if (sequenceSize >= wordLength) 00499 { 00500 // For each word 00501 offset = 0; 00502 while (offset < sequenceSize - wordLength) 00503 { 00504 // If word has occured before 00505 word = sequence + offset; 00506 hashcounter_hash(hashCounter2, word, wordLength, hashValue2, position); 00507 lookup = hashcounter_single(hashCounter2, hashValue2); 00508 00509 if (lookup) 00510 sinceChange = 0; 00511 00512 if (lookup == 1) 00513 { 00514 // If it has only occured once, now it has occured twice 00515 hashcounter_insert(hashCounter2, hashValue2); 00516 } 00517 else 00518 { 00519 submatches = 0; 00520 // First iteration 00521 if (hashCounter1 != NULL) 00522 { 00523 suboffset = offset; 00524 while (suboffset <= offset + decoQantum) 00525 { 00526 word = sequence + suboffset; 00527 hashcounter_hash(hashCounter1, word, wordLength - decoQantum, hashValue1, position); 00528 if (hashcounter_multiple(hashCounter1, hashValue1)) 00529 submatches++; 00530 suboffset++; 00531 } 00532 } 00533 if (sinceChange >= decoQantum && (hashCounter1 == NULL || submatches >= 2)) 00534 { 00535 hashcounter_insert(hashCounter2, hashValue2); 00536 sinceChange = 0; 00537 } 00538 } 00539 00540 sinceChange++; 00541 offset++; 00542 } 00543 } 00544 00545 sequenceNumber++; 00546 } 00547 00548 if (hashCounter1) 00549 hashcounter_free(hashCounter1); 00550 hashCounter1 = hashCounter2; 00551 seed = rand(); 00552 00553 if (iteration != numIterations - 1) 00554 hashCounter2 = hashcounter_new(hashCounterSize, 0); 00555 00556 printf("Iteration %d WordLength=%d\n", iteration, wordLength); fflush(stdout); 00557 hashcounter_count(hashCounter1, stdout); fflush(stdout); 00558 // printf("Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout); 00559 00560 time += clock(); 00561 printf("Iteration time=%.2f secs\n", (float)time / CLOCKS_PER_SEC); fflush(stdout); 00562 00563 iteration++; 00564 wordLength += decoQantum; 00565 } 00566 00567 // Temporary buffer for storing word locations 00568 wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences); 00569 00570 // Initialize diagonals 00571 diagonals = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock) 00572 * numberOfSequences); 00573 sequenceNumber = 0; 00574 while (sequenceNumber < numberOfSequences) 00575 { 00576 memSingleBlock_initializeExisting(diagonals + sequenceNumber, sizeof(struct diagonal), 1); 00577 sequenceNumber++; 00578 } 00579 00580 printf("Initialized diagonals\n"); fflush(stdout); 00581 00582 wordLength -= decoQantum; 00583 seed = rand(); 00584 00585 printf("Constructing and processing postings lists"); fflush(stdout); 00586 // Process collection a section at a time 00587 while (hashtablePass < numHashtablePasses) 00588 { 00589 time = -clock(); 00590 postings_initialize(); 00591 00592 // For each sequence 00593 sequenceNumber = 0; 00594 while (sequenceNumber < numberOfSequences) 00595 { 00596 sequence = sequences[sequenceNumber].sequence; 00597 sequenceSize = sequences[sequenceNumber].length; 00598 00599 // For each sequence long enough that hasn't been excluded 00600 if (sequenceSize >= wordLength) 00601 { 00602 // Create initial passhash 00603 passhash = 0; 00604 offset = 0; 00605 while (offset < wordLength) 00606 { 00607 passhash += sequence[offset]; 00608 offset++; 00609 } 00610 00611 // For each word 00612 offset = 0; 00613 while (offset < sequenceSize - wordLength) 00614 { 00615 // If to be processed this pass 00616 if (passhash % numHashtablePasses == hashtablePass) 00617 { 00618 // Insert into hash table 00619 word = sequence + offset; 00620 hashcounter_hash(hashCounter1, word, wordLength, hashValue1, position); 00621 if (hashcounter_multiple(hashCounter1, hashValue1)) 00622 { 00623 postings_addEntry(sequence + offset, wordLength, sequenceNumber, offset); 00624 } 00625 } 00626 00627 // Update passhash 00628 passhash -= sequence[offset]; 00629 passhash += sequence[offset + wordLength]; 00630 offset++; 00631 } 00632 } 00633 00634 sequenceNumber++; 00635 } 00636 00637 // Report the load on the hashtable 00638 // printf("Hashtable pass %d\n", hashtablePass); 00639 // postings_print(); 00640 // printf("Total malloced=%s\n", global_int4toString(global_totalMalloc)); 00641 time += clock(); 00642 // printf("Hashtable construction time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout); 00643 time = -clock(); 00644 00645 // Sort hash table entries in order of size 00646 finalLists = postings_getSortedLists(); 00647 00648 // Start with the longest entry 00649 key = postings_numLists - 1; 00650 while (key >= 0) 00651 { 00652 // Get each entry in descending order of size 00653 numWordLocations = postings_decodeList(finalLists[key], wordLocations); 00654 00655 if (numWordLocations > maxListLength) 00656 { 00657 // printf("Was %d\n", numWordLocations); fflush(stdout); 00658 00659 // Remove any locations refering to already clustered sequences 00660 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences); 00661 00662 if (numWordLocations >= maxListLength && 00663 !cluster_buildPSSM(wordLocations, numWordLocations, sequences)) 00664 { 00665 while (numWordLocations >= maxListLength) 00666 { 00667 // Build cluster from longest list of word locations 00668 cluster_buildCluster(wordLocations, numWordLocations, sequences); 00669 00670 // cluster_printList(wordLocations, numWordLocations); 00671 00672 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences); 00673 // printf("Now %d\n", numWordLocations); fflush(stdout); 00674 } 00675 00676 // Free the PSSM 00677 cluster_freePSSM(); 00678 } 00679 } 00680 00681 // Process a list of word locations and insert/update diagonal entries 00682 if (numWordLocations > 1) 00683 cluster_processList(diagonals, wordLocations, numWordLocations); 00684 00685 key--; 00686 } 00687 00688 free(finalLists); 00689 00690 time += clock(); 00691 // printf("Hashtable processing time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout); 00692 printf("."); fflush(stdout); 00693 00694 hashtablePass++; 00695 } 00696 00697 printf("done.\n"); fflush(stdout); 00698 free(wordLocations); 00699 00700 // Free hash counter 00701 hashcounter_free(hashCounter1); 00702 00703 printf("Identifying high-scoring sequence pairs..."); fflush(stdout); 00704 00705 sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences); 00706 00707 // For each sequence 00708 sequenceNumber = 0; 00709 while (sequenceNumber < numberOfSequences) 00710 { 00711 // Go through list of matching diagonals 00712 sequenceDiagonals = diagonals + sequenceNumber; 00713 memSingleBlock_resetCurrent(sequenceDiagonals); 00714 while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL) 00715 { 00716 sequence1 = sequenceNumber; 00717 sequence2 = diagonalMatch->matchSequence; 00718 relativeOffset = diagonalMatch->relativeOffset; 00719 seq1 = sequences + sequence1; 00720 seq2 = sequences + sequence2; 00721 00722 // Calculate rough score based on number of matches, length of overlap region 00723 score = 100 * diagonalMatch->numMatches / 00724 cluster_overlap(seq1, seq2, relativeOffset) + decoWordLength; 00725 00726 if (score > percentMatchesRequired)// && relativeOffset == 0 && seq1->length == seq2->length) 00727 { 00728 // Compare the two sequences in detail 00729 score = cluster_score(seq1, seq2, relativeOffset, wildcardsThreshold); 00730 if (score > 0) 00731 { 00732 // printf("Sequences %d,%d lengths %d,%d offset %d matches=%d score=%d\n", 00733 // sequence1, sequence2, sequences[sequence1].length, sequences[sequence2].length, 00734 // relativeOffset, diagonalMatch->numMatches, score); 00735 00736 // Record high-scoring match between pair of sequences 00737 sequenceMatch = memSingleBlock_newEntry(sequenceMatches); 00738 sequenceMatch->sequence1 = sequence1; 00739 sequenceMatch->sequence2 = sequence2; 00740 sequenceMatch->relativeOffset = relativeOffset; 00741 sequenceMatch->score = score; 00742 } 00743 } 00744 00745 } 00746 sequenceNumber++; 00747 } 00748 00749 printf("done. (%d pairs)\n", sequenceMatches->numEntries); fflush(stdout); 00750 00751 // Free memory used to store diagonal matches 00752 sequenceNumber = 0; 00753 while (sequenceNumber < numberOfSequences) 00754 { 00755 free(diagonals[sequenceNumber].block); 00756 sequenceNumber++; 00757 } 00758 free(diagonals); 00759 00760 // Process sequence matches 00761 cluster_processSequenceMatches(sequenceMatches, sequences); 00762 00763 memSingleBlock_free(sequenceMatches); 00764 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void cluster_updateWildcode | ( | struct parent * | parent, | |
| uint4 | position, | |||
| uint4 | wildCode | |||
| ) |
Definition at line 1692 of file cluster.c.
References global_realloc(), parent::numWildcards, sequence::parent, wildCode::position, uint4, wildCode::wildCode, and parent::wildCodes.
Referenced by cluster_addChild(), and cluster_mergeParents().
01693 { 01694 uint4 count; 01695 01696 count = 0; 01697 while (count < parent->numWildcards) 01698 { 01699 if (parent->wildCodes[count].position == position) 01700 { 01701 parent->wildCodes[count].wildCode = wildCode; 01702 return; 01703 } 01704 count++; 01705 } 01706 01707 // If position not found, add wildcode 01708 parent->numWildcards++; 01709 parent->wildCodes = (struct wildCode*)global_realloc(parent->wildCodes, parent->numWildcards 01710 * sizeof(struct wildCode)); 01711 01712 parent->wildCodes[parent->numWildcards - 1].position = position; 01713 parent->wildCodes[parent->numWildcards - 1].wildCode = wildCode; 01714 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void cluster_updateWildcodes | ( | struct parent * | parent, | |
| int4 | change | |||
| ) |
Definition at line 1665 of file cluster.c.
References sequence::parent, wildCode::position, uint4, and parent::wildCodes.
Referenced by cluster_addChild().
01666 { 01667 uint4 count = 0; 01668 while (count < parent->numWildcards) 01669 { 01670 parent->wildCodes[count].position += change; 01671 count++; 01672 } 01673 }
Here is the caller graph for this function:

| void cluster_writeClusters | ( | char * | filename, | |
| struct sequence * | sequences | |||
| ) |
Definition at line 252 of file cluster.c.
References parent::children, cluster_getEdits(), cluster_parents, wild::code, child::description, child::descriptionLength, sequence::descriptionLength, sequence::descriptionLocation, descriptions_getDescription(), child::edits, global_malloc(), sequence::length, child::length, parent::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), parent::numChildren, child::numEdits, readdb_dbAlphabetType, readdb_longestSequenceLength, readdb_numberOfSequences, sequence::regionStart, child::regionStart, sequence::sequence, parent::sequence, uint4, wildcards_clusterWildcards, wildcards_countOccurences(), wildcards_getOccurences(), wildcards_initializeCountOccurences(), wildcards_numClusterWildcards, wildcards_outputWildcards(), wildcards_scoreCandidates(), clusterWildcard::wildCode, parent::wildCodes, writedb_addSequence(), writedb_close(), writedb_dataFilename, writedb_descriptionsFilename, writedb_initialize(), and writedb_sequenceFilename.
Referenced by main().
00253 { 00254 struct parent* parent; 00255 char *newFilename, *wildcardsOutFilename, *renamedFilename; 00256 uint4 count, sequenceNumber, sequenceLength, descriptionLength, descriptionLocation; 00257 unsigned char *sequence, *description; 00258 uint4 childNum, numChildren, descriptionTotalLength, numWilds; 00259 struct child* children; 00260 struct wild* wilds, *wildcards; 00261 00262 printf("Writing clusters to disk..."); fflush(stdout); 00263 00264 // Construct sequence and description filenames 00265 newFilename = (char*)global_malloc(strlen(filename) + 12); 00266 sprintf(newFilename, "%s_clustered", filename); 00267 00268 // Initialize writing to formatted database 00269 writedb_initialize(newFilename, readdb_dbAlphabetType); 00270 00271 // Initialize wildcard occurence counting 00272 wildcards_initializeCountOccurences(readdb_longestSequenceLength); 00273 00274 // For each parent 00275 memBlocks_resetCurrent(cluster_parents); 00276 while ((parent = memBlocks_getCurrent(cluster_parents)) != NULL) 00277 { 00278 if (parent->children != NULL && parent->numChildren > 1) 00279 { 00280 sequenceLength = parent->length; 00281 sequence = parent->sequence; 00282 numChildren = parent->numChildren; 00283 00284 children = (struct child*)global_malloc(sizeof(struct child) * numChildren); 00285 00286 // For each child 00287 childNum = 0; 00288 descriptionTotalLength = 0; 00289 while (childNum < numChildren) 00290 { 00291 // Load description from old collection 00292 descriptionLocation = parent->children[childNum]->descriptionLocation; 00293 descriptionLength = parent->children[childNum]->descriptionLength; 00294 description = descriptions_getDescription(descriptionLocation, descriptionLength); 00295 // description = parent->children[childNum]->description; 00296 00297 children[childNum].description = description; 00298 children[childNum].descriptionLength = descriptionLength; 00299 children[childNum].regionStart = parent->children[childNum]->regionStart; 00300 children[childNum].length = parent->children[childNum]->length; 00301 children[childNum].edits = cluster_getEdits(parent, parent->children[childNum], 00302 &(children[childNum].numEdits)); 00303 childNum++; 00304 } 00305 00306 // Add sequence to the formatted collection 00307 writedb_addSequence(sequence, sequenceLength, NULL, 0, NULL, 00308 0, children, numChildren); 00309 00310 // Count final wildcard occurences 00311 wildcards_countOccurences(children, numChildren, sequenceLength); 00312 00313 // Free memory used for children and edits 00314 childNum = 0; 00315 while (childNum < numChildren) 00316 { 00317 free(children[childNum].edits); 00318 free(children[childNum].description); 00319 childNum++; 00320 } 00321 free(children); 00322 00323 // Print 00324 // if (numChildren > 1) 00325 // cluster_printCluster(parent); 00326 } 00327 00328 // Free the parent 00329 free(parent->sequence); 00330 free(parent->children); 00331 free(parent->wildCodes); 00332 } 00333 00334 printf("done.\nWriting remaining sequences to disk..."); fflush(stdout); 00335 00336 // For each sequence without a parent or only child 00337 sequenceNumber = 0; 00338 while (sequenceNumber < readdb_numberOfSequences) 00339 { 00340 if (sequences[sequenceNumber].parent == NULL || 00341 sequences[sequenceNumber].parent->numChildren == 1) 00342 { 00343 sequenceLength = sequences[sequenceNumber].length; 00344 sequence = sequences[sequenceNumber].sequence; 00345 descriptionLength = sequences[sequenceNumber].descriptionLength; 00346 descriptionLocation = sequences[sequenceNumber].descriptionLocation; 00347 00348 // Load description from old collection 00349 // description = sequences[sequenceNumber].description; 00350 description = descriptions_getDescription(descriptionLocation, descriptionLength); 00351 00352 // Add sequence to the formatted collection 00353 writedb_addSequence(sequence, sequenceLength, description, 00354 descriptionLength, NULL, 0, NULL, 0); 00355 00356 free(description); 00357 } 00358 00359 // Free the sequence 00360 // free(sequences[sequenceNumber].sequence); 00361 00362 sequenceNumber++; 00363 } 00364 00365 printf("done.\n"); fflush(stdout); 00366 00367 // Get list of wild occurences 00368 wilds = wildcards_getOccurences(&numWilds); 00369 00370 // Score the wildcards using occurences 00371 wildcards = (struct wild*)global_malloc(sizeof(struct wild) * wildcards_numClusterWildcards); 00372 count = 0; 00373 while (count < wildcards_numClusterWildcards) 00374 { 00375 wildcards[count].code = wildcards_clusterWildcards[count].wildCode; 00376 count++; 00377 } 00378 wildcards_scoreCandidates(wildcards, wildcards_numClusterWildcards, wilds, numWilds, 0); 00379 free(wilds); 00380 00381 // Output wildcards scoring info 00382 wildcardsOutFilename = (char*)global_malloc(strlen(newFilename) + 12); 00383 sprintf(wildcardsOutFilename, "%s.wildcards", newFilename); 00384 wildcards_outputWildcards(wildcardsOutFilename); 00385 00386 // Finalize writing to the new formatted collection 00387 writedb_close(); 00388 00389 // Rename clustered collection 00390 renamedFilename = (char*)global_malloc(strlen(filename) + 20); 00391 sprintf(renamedFilename, "%s.wildcards", filename); 00392 if (rename(wildcardsOutFilename, renamedFilename)) 00393 fprintf(stderr, "Error renaming file %s to %s\n", wildcardsOutFilename, renamedFilename); 00394 00395 sprintf(renamedFilename, "%s.sequences", filename); 00396 if (rename(writedb_sequenceFilename, renamedFilename)) 00397 fprintf(stderr, "Error renaming file %s to %s\n", writedb_sequenceFilename, renamedFilename); 00398 00399 sprintf(renamedFilename, "%s.descriptions", filename); 00400 if (rename(writedb_descriptionsFilename, renamedFilename)) 00401 fprintf(stderr, "Error renaming file %s to %s\n", writedb_descriptionsFilename, renamedFilename); 00402 00403 sprintf(renamedFilename, "%s.data", filename); 00404 if (rename(writedb_dataFilename, renamedFilename)) 00405 fprintf(stderr, "Error renaming file %s to %s\n", writedb_dataFilename, renamedFilename); 00406 }
Here is the call graph for this function:

Here is the caller graph for this function:

| int4 main | ( | int4 | argc, | |
| char * | argv[] | |||
| ) |
Definition at line 135 of file cluster.c.
References scoreMatrix::averageMatchScore, cluster_averageMatchScore, cluster_averageWildcodeScores, cluster_calculateSavings(), cluster_parents, cluster_spexClusterSequences(), cluster_writeClusters(), sequence::descriptionLength, sequence::descriptionLocation, encoding_alphabetTypes, encoding_free(), encoding_initialize(), encoding_protein, global_free(), global_malloc(), sequence::length, memBlocks_free(), memBlocks_initialize(), sequence::number, parameters_findScoringMatrix(), parameters_free(), parameters_scoringMatrixPath, sequence::parent, readdb_close(), readdb_dbAlphabetType, readdb_longestSequenceLength, readdb_numberOfClusters, readdb_numberOfLetters, readdb_numberOfSequences, readdb_open(), readdb_readSequence(), scoreMatrix_free(), scoreMatrix_load(), sequence::sequence, uint4, wildcards_initialize(), wildcards_readWildcards(), and wildcards_scoreMatrix.
00136 { 00137 unsigned char *filename, *sequence; 00138 uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength; 00139 uint4 sequenceNumber = 0, encodedLength; 00140 struct sequence* sequences; 00141 00142 // User must provide FASTA format file at command line 00143 if (argc < 2) 00144 { 00145 fprintf(stderr, "Useage: cluster <Database>\n"); 00146 exit(-1); 00147 } 00148 filename = argv[1]; 00149 00150 // Open formatted file for reading 00151 readdb_open(filename); 00152 00153 printf("Number of sequences = %u\n", readdb_numberOfSequences); 00154 printf("Total number of letters = %llu\n", readdb_numberOfLetters); 00155 printf("Length of longest sequence = %u\n", readdb_longestSequenceLength); 00156 printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]); 00157 00158 if (readdb_dbAlphabetType != encoding_protein) 00159 { 00160 fprintf(stderr, "Error: Only protein collection clustering currently supported\n"); 00161 exit(-1); 00162 } 00163 00164 if (readdb_numberOfSequences != readdb_numberOfClusters) 00165 { 00166 fprintf(stderr, "Error clustering %s: collection has already been clustered\n", filename); 00167 exit(-1); 00168 } 00169 00170 sequences = (struct sequence*)global_malloc(sizeof(struct sequence) * readdb_numberOfSequences); 00171 00172 // Initialize codes array 00173 encoding_initialize(readdb_dbAlphabetType); 00174 00175 // Initialize cluster wildcards 00176 wildcards_initialize(); 00177 00178 if (argc == 5) 00179 { 00180 // Read in a set of wildcards 00181 wildcards_readWildcards(argv[4]); 00182 } 00183 00184 // Load score matrix 00185 parameters_findScoringMatrix(); 00186 wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath); 00187 00188 // Calculate average score for matching two residues 00189 cluster_averageMatchScore = wildcards_scoreMatrix.averageMatchScore; 00190 00191 // Move through index file reading sequence codes and converting back to ASCII sequences 00192 sequenceNumber = 0; 00193 while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart, 00194 &descriptionLength, &encodedLength)) 00195 { 00196 // Record sequence, sequence length, number, no parent 00197 sequences[sequenceNumber].sequence = sequence; 00198 sequences[sequenceNumber].length = sequenceLength; 00199 sequences[sequenceNumber].number = sequenceNumber; 00200 sequences[sequenceNumber].parent = NULL; 00201 sequences[sequenceNumber].descriptionLength = descriptionLength; 00202 sequences[sequenceNumber].descriptionLocation = descriptionStart; 00203 00204 sequenceNumber++; 00205 } 00206 00207 if (sequenceNumber != readdb_numberOfSequences) 00208 { 00209 fprintf(stderr, "Error: only %d sequences read\n", sequenceNumber); 00210 exit(-1); 00211 } 00212 00213 printf("%d sequences read.\n", sequenceNumber); 00214 fflush(stdout); 00215 00216 // Initialize array for storing parents 00217 cluster_parents = memBlocks_initialize(sizeof(struct parent), 10000); 00218 00219 // Simple approach to clustering sequences 00220 // cluster_simpleClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters); 00221 00222 // Perform SPEX algorithm and cluster sequences 00223 cluster_spexClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters); 00224 00225 // Cluster sequences in the collection 00226 // cluster_clusterSequences(sequences, sequenceNumber, readdb_numberOfLetters, filename); 00227 00228 printf("Total bytes saved=%d\n", cluster_calculateSavings()); fflush(stdout); 00229 00230 // printf("END Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout); 00231 00232 // Load sequence descriptions from disk 00233 // cluster_loadDescriptions(sequences); 00234 00235 // Write clusters to disk 00236 cluster_writeClusters(filename, sequences); 00237 00238 // Close reading formatted database 00239 readdb_close(); 00240 00241 free(cluster_averageWildcodeScores); 00242 free(sequences); 00243 memBlocks_free(cluster_parents); 00244 encoding_free(); 00245 global_free(); 00246 scoreMatrix_free(wildcards_scoreMatrix); 00247 parameters_free(); 00248 return 0; 00249 }
Here is the call graph for this function:

| float cluster_averageMatchScore = 0 |
| float* cluster_averageWildcodeScores = NULL |
| uint4 cluster_numParents = 0 |
| struct memBlocks* cluster_parents |
Definition at line 71 of file cluster.c.
Referenced by cluster_buildCluster(), cluster_calculateSavings(), cluster_processSequenceMatches(), cluster_writeClusters(), and main().
| uint4** cluster_PSSM |
Definition at line 73 of file cluster.c.
Referenced by cluster_buildCluster(), cluster_buildPSSM(), and cluster_freePSSM().
| int4 cluster_PSSMend |
| int4 cluster_PSSMlength |
| int4 cluster_PSSMstart |
| float wildcardsThreshold = 0.25 |
Definition at line 69 of file cluster.c.
Referenced by cluster_buildCluster(), cluster_clusterSequences(), cluster_processSequenceMatches(), cluster_simpleClusterSequences(), cluster_spexClusterSequences(), and rsdb_spexClusterSequences().
1.5.2