src/cluster.c File Reference

#include "blast.h"
#include "vbyte2.h"
#include "readindex.h"
#include "vec.h"
#include "vbyte2.c"
#include "readindex.c"
#include "vec.c"

Include dependency graph for cluster.c:

Go to the source code of this file.

Data Structures

struct  diagonal
struct  sequenceMatch
struct  sequence
struct  parent
struct  wildCode

Defines

#define decoWordLength   30
#define numIterations   3
#define decoQantum   9
#define maxListLength   100
#define wildcardSumWindow   100
#define percentMatchesRequired   15

Functions

void cluster_printSequence (uint4 whitespace, unsigned char *sequence, uint4 length)
void cluster_processSequenceMatches (struct memSingleBlock *sequenceMatches, struct sequence *sequences)
void cluster_simpleClusterSequences (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters)
void cluster_clusterSequences (struct sequence *sequences, uint4 numSequences, uint4 numberOfLetters, char *filename)
int4 cluster_compareScore (const void *sequenceMatch1, const void *sequenceMatch2)
void cluster_processList (struct memSingleBlock *diagonals, struct wordLocation *wordLocations, uint4 numWordLocations)
void cluster_printCluster (struct parent *parent)
uint4 cluster_score (struct sequence *seq1, struct sequence *seq2, int4 relativeOffset, float wildscoreAllowed)
uint4 cluster_overlap (struct sequence *sequence1, struct sequence *sequence2, int4 relativeOffset)
void cluster_newParent (struct parent *newParent, struct sequence *child)
void cluster_addChild (struct parent *parent, struct sequence *child, int4 relativeOffset)
void cluster_mergeParents (struct parent *parent1, struct parent *parent2, int4 relativeOffset)
uint4 cluster_checkMerge (struct wordLocation *wordLocations1, uint4 numWordLocations1, struct wordLocation *wordLocations2, uint4 numWordLocations2, int4 *bestDiagonal)
wordLocationcluster_mergeLists (struct wordLocation *wordLocations1, uint4 *numWordLocations1, struct wordLocation *wordLocations2, uint4 *numWordLocations2, int4 mergeRelativeOffset, uint *lengthNewList)
void cluster_printList (struct wordLocation *list, uint4 length)
int cluster_buildCluster (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences)
uint4 cluster_removeChildren (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences)
int4 cluster_calculateSavings ()
void cluster_writeClusters (char *filename, struct sequence *sequences)
editcluster_getEdits (struct parent *parent, struct sequence *child, uint4 *numEdits)
int cluster_buildPSSM (struct wordLocation *wordLocations, uint4 numWordLocations, struct sequence *sequences)
float cluster_averageWildcodeScore (uint4 wildcode)
void cluster_freePSSM ()
void cluster_spexClusterSequences (struct sequence *sequences, uint4 numberOfSequences, uint4 numberOfLetters)
int4 main (int4 argc, char *argv[])
int4 cluster_compareSequenceLengths (const void *sequence1, const void *sequence2)
void cluster_updateWildcodes (struct parent *parent, int4 change)
uint4 cluster_getWildcode (struct parent *parent, uint4 position)
void cluster_updateWildcode (struct parent *parent, uint4 position, uint4 wildCode)

Variables

float cluster_averageMatchScore = 0
float * cluster_averageWildcodeScores = NULL
float wildcardsThreshold = 0.25
uint4 cluster_numMergeDiagonals
memBlockscluster_parents
uint4 cluster_numParents = 0
uint4 ** cluster_PSSM
int4 cluster_PSSMstart
int4 cluster_PSSMend
int4 cluster_PSSMlength


Define Documentation

#define decoQantum   9

Definition at line 19 of file cluster.c.

Referenced by cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), and rsdb_winnowingSpex().

#define decoWordLength   30

Definition at line 17 of file cluster.c.

Referenced by cluster_clusterSequences(), cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), rsdb_spexClusterSequences(), and rsdb_winnowingSpex().

#define maxListLength   100

Definition at line 20 of file cluster.c.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), cluster_spexClusterSequences(), main(), and rsdb_spexClusterSequences().

#define numIterations   3

Definition at line 18 of file cluster.c.

Referenced by cluster_spexClusterSequences(), main(), rsdb_slottedSpex(), and rsdb_winnowingSpex().

#define percentMatchesRequired   15

Definition at line 22 of file cluster.c.

Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().

#define wildcardSumWindow   100

Definition at line 21 of file cluster.c.

Referenced by cluster_score().


Function Documentation

void cluster_addChild ( struct parent parent,
struct sequence child,
int4  relativeOffset 
)

Definition at line 1771 of file cluster.c.

References parent::children, cluster_getWildcode(), cluster_updateWildcode(), cluster_updateWildcodes(), encoding_aaStartWildcards, getbit, global_malloc(), global_realloc(), int4, sequence::length, parent::length, parent::numChildren, sequence::parent, sequence::regionStart, sequence::sequence, parent::sequence, setbit, wildcards_clusterWildcards, wildcards_numClusterWildcards, and wildcards_printWildcard().

Referenced by cluster_buildCluster(), cluster_mergeParents(), and cluster_processSequenceMatches().

01772 {
01773         int4 childNum, count, wildCode, wildcardCount;
01774         unsigned char* newParentSequence;
01775 
01776     if (relativeOffset < 0)
01777     {
01778         // Child starts before parent, add new sequence at beginning of parent
01779                 newParentSequence = (char*)global_malloc(parent->length - relativeOffset);
01780         memcpy(newParentSequence - relativeOffset, parent->sequence, parent->length);
01781                 memcpy(newParentSequence, child->sequence, -relativeOffset);
01782         free(parent->sequence);
01783         parent->sequence = newParentSequence;
01784         parent->length -= relativeOffset;
01785 
01786         // Update children coordinates
01787         childNum = 0;
01788         while (childNum < parent->numChildren)
01789         {
01790                 parent->children[childNum]->regionStart -= relativeOffset;
01791                 childNum++;
01792         }
01793 
01794         // Update wildcodes
01795         cluster_updateWildcodes(parent, -relativeOffset);
01796 
01797         relativeOffset = 0;
01798     }
01799 
01800     // If child is longer than parent
01801     if (child->length + relativeOffset > parent->length)
01802     {
01803         // Add end of new sequence onto parent
01804                 parent->sequence = (char*)global_realloc(parent->sequence, child->length + relativeOffset);
01805                 memcpy(parent->sequence + parent->length, child->sequence + parent->length - relativeOffset,
01806                child->length - parent->length + relativeOffset);
01807         parent->length = child->length + relativeOffset;
01808     }
01809 
01810 //    printf("[%d:%d:%d]\n", child->length, parent->length, relativeOffset);
01811     // For region that child and parent have in common
01812     count = 0;
01813     while (count < child->length)
01814     {
01815 //      printf("[%d/%d]", count + relativeOffset, parent->length); fflush(stdout);
01816         // If characters don't match
01817                 if (child->sequence[count] != parent->sequence[count + relativeOffset])
01818         {
01819 //              printf("%d: %d,%d\n", count + relativeOffset, child->sequence[count],
01820 //                                  parent->sequence[count + relativeOffset]); fflush(stdout);
01821 
01822                 if (parent->sequence[count + relativeOffset] >= encoding_aaStartWildcards)
01823             {
01824                 // Get parent wildcode
01825                 wildCode = cluster_getWildcode(parent, count + relativeOffset);
01826                         }
01827             else
01828             {
01829                 // Add parent character to new wildcode
01830                     wildCode = 0;
01831                 setbit(wildCode, parent->sequence[count + relativeOffset]);
01832             }
01833 
01834             // If child residue will change wildcode
01835             if (!getbit(wildCode, child->sequence[count]))
01836             {
01837                 // Add child character to wildcode
01838                 setbit(wildCode, child->sequence[count]);
01839                                 cluster_updateWildcode(parent, count + relativeOffset, wildCode);
01840 
01841 //                printf("set %d pos %d = %d\n", parent->number, count + relativeOffset, wildCode);
01842 
01843                 // Check for a match in set of wildcards
01844                 wildcardCount = 0;
01845                 while (wildcardCount < wildcards_numClusterWildcards)
01846                 {
01847                     if ((wildcards_clusterWildcards[wildcardCount].wildCode & wildCode) == wildCode)
01848                     {
01849 //                      printf("Match: ");
01850 //                          wildcards_printWildcard(wildcards_clusterWildcards[wildcardCount].wildCode);
01851                         break;
01852                     }
01853                     wildcardCount++;
01854                 }
01855 
01856                 if (wildcardCount >= wildcards_numClusterWildcards)
01857                 {
01858                     printf("Error %d >= %d\n", wildcardCount, wildcards_numClusterWildcards);
01859                     wildcards_printWildcard(wildCode);
01860                     exit(-1);
01861                 }
01862 
01863                 // Insert wildcard character into parent
01864                 parent->sequence[count + relativeOffset] = encoding_aaStartWildcards + wildcardCount;
01865             }
01866 
01867 //            printf("Inserted wildNum %d into parent\n", wildcardCount);
01868         }
01869         count++;
01870     }
01871 
01872     // Add new child
01873     child->regionStart = relativeOffset;
01874     child->parent = parent;
01875     parent->numChildren++;
01876         parent->children = (struct sequence**)global_realloc(parent->children,
01877                        sizeof(struct sequence*) * parent->numChildren);
01878         parent->children[parent->numChildren - 1] = child;
01879 
01880 //      printf("Added %d: \n", child->number);
01881 //    cluster_printCluster(parent);
01882 }

Here is the call graph for this function:

Here is the caller graph for this function:

float cluster_averageWildcodeScore ( uint4  wildcode  ) 

Definition at line 2091 of file cluster.c.

References cluster_averageWildcodeScores, constants_sentinalScore, encoding_numLetters, getbit, global_malloc(), int4, scoreMatrix::matrix, Robinson_prob, and wildcards_scoreMatrix.

02092 {
02093         unsigned char code, wildResidue;
02094         float averageScore = 0;
02095     int4 bestScore, tempCode;
02096 
02097     // Initialize array to hold precomputed answers
02098     if (cluster_averageWildcodeScores == NULL)
02099     {
02100         cluster_averageWildcodeScores = (float*)global_malloc(sizeof(float)
02101                                       * ceil(pow(2, encoding_numLetters)));
02102                 tempCode = 0;
02103                 while (tempCode < ceil(pow(2, encoding_numLetters)))
02104         {
02105                         cluster_averageWildcodeScores[tempCode] = 0;
02106                 tempCode++;
02107         }
02108     }
02109 
02110     if (cluster_averageWildcodeScores[wildCode] == 0)
02111         {
02112         // For each residue
02113         code = 0;
02114         while (code < encoding_numLetters)
02115         {
02116             // For each residue in wildcode
02117             bestScore = constants_sentinalScore;
02118             wildResidue = 0;
02119             while (wildResidue < encoding_numLetters)
02120             {
02121                 if (getbit(wildCode, wildResidue))
02122                 {
02123                     if (wildcards_scoreMatrix.matrix[code][wildResidue] > bestScore)
02124                         bestScore = wildcards_scoreMatrix.matrix[code][wildResidue];
02125                 }
02126                 wildResidue++;
02127             }
02128 
02129     //        printf("averageScore=%f\n", (float)bestScore);
02130             averageScore += (float)bestScore * (Robinson_prob[code] / 1000.0);
02131 
02132             code++;
02133         }
02134 
02135         cluster_averageWildcodeScores[wildCode] = averageScore;
02136         }
02137 
02138     return cluster_averageWildcodeScores[wildCode];
02139 }

Here is the call graph for this function:

int cluster_buildCluster ( struct wordLocation wordLocations,
uint4  numWordLocations,
struct sequence sequences 
)

Definition at line 1291 of file cluster.c.

References cluster_addChild(), cluster_newParent(), cluster_parents, cluster_PSSM, cluster_score(), global_malloc(), int4, sequence::length, memBlocks_newEntry(), wordLocation::offset, sequence::regionStart, parent::sequence, sequence::sequence, wordLocation::sequenceNumber, uint4, and wildcardsThreshold.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01293 {
01294         uint4 *toCluster, numToCluster, *toCluster2, numToCluster2, *toCluster3;
01295         int4 start, end, column;
01296     int4 score, bestScore, bestLocation;
01297     struct sequence sequence;
01298         uint4 locationNum;
01299     struct parent* parent;
01300     struct wordLocation firstMember;
01301 
01302 //    printf("cluster_buildCluster["); fflush(stdout);
01303 
01304     // Compare sequences to the PSSM
01305     bestScore = 0; bestLocation = 0;
01306     locationNum = 0;
01307     while (locationNum < numWordLocations)
01308     {
01309         if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL)
01310         {
01311             // For each sequence
01312             sequence = sequences[wordLocations[locationNum].sequenceNumber];
01313 
01314             // Find start and end
01315             start = -wordLocations[locationNum].offset;
01316             end = sequence.length - wordLocations[locationNum].offset;
01317 
01318             // Score sequence against PSSM
01319             score = 0;
01320             column = start;
01321             while (column < end)
01322             {
01323                 score += cluster_PSSM[column][sequence.sequence[column - start]];
01324                 column++;
01325             }
01326 
01327             // Check if this sequence is most similiar to PSSM
01328             score = 100 * score / sequence.length;
01329             if (score > bestScore)
01330             {
01331                 bestLocation = locationNum;
01332                 bestScore = score;
01333             }
01334 
01335 //            printf("[%3d] ", score);
01336 //            cluster_printSequence(start - cluster_PSSMstart, sequence.sequence, sequence.length);
01337                 }
01338 
01339         locationNum++;
01340     }
01341 
01342     // Closest to the consensus becomes the first member of the new cluster
01343     parent = (struct parent*)memBlocks_newEntry(cluster_parents);
01344     cluster_newParent(parent, sequences + wordLocations[bestLocation].sequenceNumber);
01345     firstMember = wordLocations[bestLocation];
01346 
01347     // Remove first member from PSSM
01348     sequence = sequences[wordLocations[bestLocation].sequenceNumber];
01349     start = -wordLocations[bestLocation].offset;
01350     end = sequence.length - wordLocations[bestLocation].offset;
01351     column = start;
01352     while (column < end)
01353     {
01354         cluster_PSSM[column][sequence.sequence[column - start]]--;
01355         column++;
01356     }
01357 
01358         toCluster = (uint4*)global_malloc(sizeof(uint4) * numWordLocations);
01359     toCluster2 = (uint4*)global_malloc(sizeof(uint4) * numWordLocations);
01360     numToCluster = 0; numToCluster2 = 0;
01361 
01362     // Initially consider every sequence
01363     locationNum = 0;
01364     while (locationNum < numWordLocations)
01365     {
01366         // If not already a member of a cluster
01367         if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL)
01368         {
01369                 // Add to list to align to new cluster's parent
01370                 toCluster[numToCluster] = locationNum;
01371             numToCluster++;
01372                 }
01373         locationNum++;
01374         }
01375 
01376 //    printf("!"); fflush(stdout);
01377 
01378     // Repeat until no more sequences are similiar enough to the parent
01379     while (numToCluster > 0)
01380         {
01381         // For each sequence still worth considering
01382         bestScore = 0; bestLocation = 0;
01383 //        printf("%d to cluster\n", numToCluster);
01384         while (numToCluster > 0)
01385         {
01386             numToCluster--;
01387             locationNum = toCluster[numToCluster];
01388 
01389                 if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL)
01390                         {
01391                 // Get sequence, start and end
01392                 sequence = sequences[wordLocations[locationNum].sequenceNumber];
01393                 start = -wordLocations[locationNum].offset;
01394                 end = sequence.length - wordLocations[locationNum].offset;
01395 
01396                 // Calculate score for aligning sequence to parent of cluster
01397                 score = cluster_score(sequences + firstMember.sequenceNumber, &sequence,
01398                         firstMember.offset - wordLocations[locationNum].offset,
01399                         wildcardsThreshold);
01400 
01401                 // If above cutoff
01402                 if (score > 0)
01403                 {
01404                     // Add to list
01405                     toCluster2[numToCluster2] = locationNum;
01406                     numToCluster2++;
01407 
01408                     // Calculate highest scoring match
01409                     if (score > bestScore)
01410                     {
01411                         bestScore = score;
01412                         bestLocation = locationNum;
01413                     }
01414 
01415 //                    cluster_printSequence(start - cluster_PSSMstart, sequence.sequence, sequence.length);
01416 //                    printf("Score=%d\n", score);
01417                 }
01418                 }
01419         }
01420 
01421         // If we have matches to our close-to-consensus sequence
01422         if (bestScore > 0)
01423         {
01424             // Add the closest match to the new cluster
01425             cluster_addChild(parent, sequences + wordLocations[bestLocation].sequenceNumber,
01426                              firstMember.offset - wordLocations[bestLocation].offset +
01427                              sequences[firstMember.sequenceNumber].regionStart);
01428 
01429             // Remove new member from PSSM
01430             sequence = sequences[wordLocations[bestLocation].sequenceNumber];
01431             start = -wordLocations[bestLocation].offset;
01432             end = sequence.length - wordLocations[bestLocation].offset;
01433             column = start;
01434             while (column < end)
01435             {
01436                 cluster_PSSM[column][sequence.sequence[column - start]]--;
01437                 column++;
01438             }
01439 
01440             // Switch clusters so processing new list of close matches to parent
01441             toCluster3 = toCluster;
01442             toCluster = toCluster2;
01443             toCluster2 = toCluster3;
01444             numToCluster = numToCluster2;
01445             numToCluster2 = 0;
01446         }
01447 
01448 //      cluster_printCluster(parent);
01449         }
01450 
01451 //    cluster_printCluster(parent);
01452 
01453         free(toCluster);
01454     free(toCluster2);
01455 
01456 //    printf("]\n"); fflush(stdout);
01457 
01458     return 0;
01459 }

Here is the call graph for this function:

Here is the caller graph for this function:

int cluster_buildPSSM ( struct wordLocation wordLocations,
uint4  numWordLocations,
struct sequence sequences 
)

Definition at line 1206 of file cluster.c.

References cluster_PSSM, cluster_PSSMend, cluster_PSSMlength, cluster_PSSMstart, encoding_numCodes, global_malloc(), int4, sequence::length, wordLocation::offset, sequence::sequence, wordLocation::sequenceNumber, and uint4.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01208 {
01209         int4 start, end, column;
01210         uint4 locationNum;
01211     unsigned char code;
01212     struct sequence sequence;
01213 
01214     cluster_PSSMstart = 0;
01215         cluster_PSSMend = 0;
01216 
01217     // Calculate the start and end of consensus region
01218     locationNum = 0;
01219     while (locationNum < numWordLocations)
01220     {
01221         // Check sequences do not already have parents
01222         if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL)
01223         {
01224             start = -wordLocations[locationNum].offset;
01225             if (start < cluster_PSSMstart)
01226                 cluster_PSSMstart = -wordLocations[locationNum].offset;
01227 
01228             end = sequences[wordLocations[locationNum].sequenceNumber].length
01229                 - wordLocations[locationNum].offset;
01230             if (end > cluster_PSSMend)
01231                 cluster_PSSMend = end;
01232                 }
01233 
01234         locationNum++;
01235     }
01236 
01237     if (cluster_PSSMstart == 0 && cluster_PSSMend == 0)
01238         return 1;
01239 
01240 //    printf("consensus start=%d end=%d\n", cluster_PSSMstart, cluster_PSSMend);
01241 
01242     // Initialize the PSSM
01243     cluster_PSSMlength = cluster_PSSMend - cluster_PSSMstart;
01244     cluster_PSSM = (uint4**)global_malloc(sizeof(uint4*) * cluster_PSSMlength);
01245     cluster_PSSM -= cluster_PSSMstart;
01246         column = cluster_PSSMstart;
01247     while (column < cluster_PSSMend)
01248     {
01249         // Initialize each column to zero values
01250         cluster_PSSM[column] = (uint4*)global_malloc(sizeof(uint4) * encoding_numCodes);
01251         code = 0;
01252         while (code < encoding_numCodes)
01253         {
01254                         cluster_PSSM[column][code] = 0;
01255                 code++;
01256         }
01257         column++;
01258     }
01259 
01260     // Calculate residue frequencies at each column
01261     locationNum = 0;
01262     while (locationNum < numWordLocations)
01263     {
01264         if (sequences[wordLocations[locationNum].sequenceNumber].parent == NULL)
01265         {
01266             // For each sequence
01267             sequence = sequences[wordLocations[locationNum].sequenceNumber];
01268 
01269             // Find start and end
01270             start = -wordLocations[locationNum].offset;
01271             end = sequence.length - wordLocations[locationNum].offset;
01272 
01273             // Process each residue in the sequence and update count in PSSM
01274             column = 0;
01275             while (column < sequence.length)
01276             {
01277 //              printf("[%d]", sequence.sequence[column]); fflush(stdout);
01278                 cluster_PSSM[column + start][sequence.sequence[column]]++;
01279                 column++;
01280             }
01281                 }
01282 
01283         locationNum++;
01284     }
01285 
01286     return 0;
01287 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 cluster_calculateSavings (  ) 

Definition at line 1533 of file cluster.c.

References parent::children, cluster_parents, encoding_aaStartWildcards, int4, sequence::length, parent::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), sequence::number, parent::numChildren, sequence::parent, and parent::sequence.

Referenced by main().

01534 {
01535         int4 count, childNum, numCharsSaved, bytesSaved = 0, numWilds;
01536     struct sequence* child;
01537         struct parent* parent;
01538 
01539     // For each parent
01540     memBlocks_resetCurrent(cluster_parents);
01541         while ((parent = memBlocks_getCurrent(cluster_parents)) != NULL)
01542     {
01543         count = 0; numWilds = 0;
01544         while (count < parent->length)
01545         {
01546                         if (parent->sequence[count] >= encoding_aaStartWildcards)
01547                                 numWilds++;
01548             count++;
01549         }
01550 
01551         // If it has children
01552         if (parent->children != NULL)
01553         {
01554             // For each child calculate number of characters saved
01555             numCharsSaved = 0;
01556             childNum = 0;
01557             while (childNum < parent->numChildren)
01558             {
01559                 child = parent->children[childNum];
01560 
01561                 // CHECK: no child has two parents
01562                 if (child->parent != parent)
01563                 {
01564                         printf("Error sequence %d has multiple parents\n", child->number);
01565                     exit(-1);
01566                 }
01567 
01568                 numCharsSaved += child->length;
01569                 childNum++;
01570             }
01571 
01572             // Minus cost of parent
01573             bytesSaved += numCharsSaved - parent->length;
01574 
01575             if (numCharsSaved < parent->length)
01576             {
01577                                 printf("Error parent length %d with %d children saves negative bytes",
01578                         parent->length, parent->numChildren);
01579                 exit(-1);
01580             }
01581         }
01582     }
01583 
01584         return bytesSaved;
01585 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 cluster_checkMerge ( struct wordLocation wordLocations1,
uint4  numWordLocations1,
struct wordLocation wordLocations2,
uint4  numWordLocations2,
int4 *  bestDiagonal 
)

void cluster_clusterSequences ( struct sequence sequences,
uint4  numSequences,
uint4  numberOfLetters,
char *  filename 
)

Definition at line 870 of file cluster.c.

References close_index(), cluster_buildCluster(), cluster_buildPSSM(), cluster_freePSSM(), cluster_overlap(), cluster_processList(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), decoWordLength, listinfo::doc_count, listinfo::doc_numbers, get_next_biglist(), get_next_list(), global_malloc(), int4, diagonal::matchSequence, maxListLength, memSingleBlock_free(), memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_initializeExisting(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, diagonal::numMatches, wordLocation::offset, open_index(), percentMatchesRequired, listinfo::phrase_offsets, diagonal::relativeOffset, reset_index(), wordLocation::sequenceNumber, uint4, and wildcardsThreshold.

00872 {
00873         struct wordLocation* wordLocations;
00874         uint4 numWordLocations;
00875     int4 relativeOffset, sequenceNumber;
00876     uint4 sequence1, sequence2, score;
00877         struct memSingleBlock* diagonals, *sequenceDiagonals;
00878     struct diagonal* diagonalMatch;
00879     struct sequenceMatch *sequenceMatch;
00880     struct memSingleBlock* sequenceMatches;
00881         struct sequence* seq1, *seq2;
00882         struct index_scanner *index_scanner;
00883         struct listinfo* listinfo = NULL;
00884     uint4 offsetCount, sequenceCount;
00885 
00886     // Initialize diagonals
00887     diagonals = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock)
00888                                                       * numberOfSequences);
00889         sequenceNumber = 0;
00890     while (sequenceNumber < numberOfSequences)
00891     {
00892         memSingleBlock_initializeExisting(diagonals + sequenceNumber, sizeof(struct diagonal), 1);
00893         sequenceNumber++;
00894     }
00895 
00896     printf("Initialized diagonals\n"); fflush(stdout);
00897 
00898     // Temporary buffer for storing word locations
00899     wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences);
00900 
00901     index_scanner = open_index(filename);
00902 
00903     // First process the longest lists
00904     while ((listinfo = get_next_biglist(index_scanner, listinfo)) != NULL &&
00905            listinfo->doc_count >= maxListLength)
00906         {
00907         printf("doc_count=%ld\n", listinfo->doc_count); fflush(stdout);
00908 
00909         // For each sequence
00910         numWordLocations = 0;
00911         sequenceCount = 0;
00912         while (sequenceCount < listinfo->doc_count)
00913         {
00914                         // Only consider first occurence in that sequence
00915                         offsetCount = 0;
00916 
00917             wordLocations[numWordLocations].sequenceNumber
00918                 = listinfo->doc_numbers[sequenceCount] - 1;
00919             wordLocations[numWordLocations].offset
00920                 = listinfo->phrase_offsets[sequenceCount][offsetCount] - 1;
00921 
00922             numWordLocations++;
00923             offsetCount++;
00924 
00925                 sequenceCount++;
00926         }
00927 
00928         printf("Was %d\n", numWordLocations); fflush(stdout);
00929 
00930         // Remove any locations refering to already clustered sequences
00931         numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences);
00932 
00933         if (numWordLocations >= maxListLength &&
00934             !cluster_buildPSSM(wordLocations, numWordLocations, sequences))
00935                 {
00936             while (numWordLocations >= maxListLength)
00937             {
00938                 // Build cluster from longest list of word locations
00939                 cluster_buildCluster(wordLocations, numWordLocations, sequences);
00940 
00941                 //      cluster_printList(wordLocations, numWordLocations);
00942 
00943                 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences);
00944                 printf("Now %d\n", numWordLocations); fflush(stdout);
00945             }
00946 
00947             // Free the PSSM
00948             cluster_freePSSM();
00949                 }
00950 
00951         // Process a list of word locations and insert/update diagonal entries
00952         cluster_processList(diagonals, wordLocations, numWordLocations);
00953     }
00954 
00955     // Process remaining shorter lists
00956         index_scanner = reset_index(index_scanner);
00957     while ((listinfo = get_next_list(index_scanner, listinfo)) != NULL)
00958         {
00959         if (listinfo->doc_count < maxListLength)
00960         {
00961             // For each sequence
00962             numWordLocations = 0;
00963             sequenceCount = 0;
00964             while (sequenceCount < listinfo->doc_count)
00965             {
00966                 // Only consider first occurence in that sequence
00967                 offsetCount = 0;
00968 
00969                 wordLocations[numWordLocations].sequenceNumber
00970                     = listinfo->doc_numbers[sequenceCount] - 1;
00971                 wordLocations[numWordLocations].offset
00972                     = listinfo->phrase_offsets[sequenceCount][offsetCount] - 1;
00973 
00974                 offsetCount++;
00975                 numWordLocations++;
00976 
00977                 sequenceCount++;
00978             }
00979 
00980 //            printf("cluster_processList with %d entries\n", numWordLocations);
00981             // Process a list of word locations and insert/update diagonal entries
00982             cluster_processList(diagonals, wordLocations, numWordLocations);
00983         }
00984     }
00985     close_index(index_scanner);
00986 
00987     free(wordLocations);
00988 
00989     printf("END STAGE 2\n"); fflush(stdout);
00990 
00991     sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences);
00992 
00993     printf("\n");
00994     // For each sequence
00995     sequenceNumber = 0;
00996     while (sequenceNumber < numberOfSequences)
00997     {
00998         // Go through list of matching diagonals
00999         sequenceDiagonals = diagonals + sequenceNumber;
01000         memSingleBlock_resetCurrent(sequenceDiagonals);
01001         while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL)
01002         {
01003                 sequence1 = sequenceNumber;
01004             sequence2 = diagonalMatch->matchSequence;
01005             relativeOffset = diagonalMatch->relativeOffset;
01006             seq1 = sequences + sequence1;
01007             seq2 = sequences + sequence2;
01008 
01009             // Calculate rough score based on number of matches, length of overlap region
01010             score = 100 * diagonalMatch->numMatches /
01011                     cluster_overlap(seq1, seq2, relativeOffset) + decoWordLength;
01012 
01013             if (score > percentMatchesRequired)// && relativeOffset == 0 && seq1->length == seq2->length)
01014             {
01015                 // Compare the two sequences in detail
01016                 score = cluster_score(seq1, seq2, relativeOffset, wildcardsThreshold);
01017                 if (score > 0)
01018                 {
01019 //                    printf("Sequences %d,%d lengths %d,%d offset %d matches=%d score=%d\n",
01020 //                           sequence1, sequence2, sequences[sequence1].length, sequences[sequence2].length,
01021 //                           relativeOffset, diagonalMatch->numMatches, score);
01022 
01023                     // Record high-scoring match between pair of sequences
01024                     sequenceMatch = memSingleBlock_newEntry(sequenceMatches);
01025                     sequenceMatch->sequence1 = sequence1;
01026                     sequenceMatch->sequence2 = sequence2;
01027                     sequenceMatch->relativeOffset = relativeOffset;
01028                     sequenceMatch->score = score;
01029                 }
01030             }
01031 
01032         }
01033         sequenceNumber++;
01034     }
01035 
01036     printf("END STAGE 3 numSequenceMatches=%d\n", sequenceMatches->numEntries); fflush(stdout);
01037 
01038     // Free memory used to store diagonal matches
01039         sequenceNumber = 0;
01040     while (sequenceNumber < numberOfSequences)
01041     {
01042         free(diagonals[sequenceNumber].block);
01043         sequenceNumber++;
01044     }
01045     free(diagonals);
01046 
01047     // Process sequence matches
01048     cluster_processSequenceMatches(sequenceMatches, sequences);
01049 
01050     memSingleBlock_free(sequenceMatches);
01051 }

Here is the call graph for this function:

int4 cluster_compareScore ( const void *  sequenceMatch1,
const void *  sequenceMatch2 
)

Definition at line 1885 of file cluster.c.

References sequenceMatch::score.

Referenced by cluster_processSequenceMatches().

01886 {
01887         const struct sequenceMatch *a1, *a2;
01888 
01889         a1 = (struct sequenceMatch*)sequenceMatch1;
01890         a2 = (struct sequenceMatch*)sequenceMatch2;
01891 
01892         if (a1->score > a2->score)
01893         {
01894                 return -1;
01895         }
01896         else if (a1->score < a2->score)
01897         {
01898                 return 1;
01899         }
01900         else
01901         {
01902                 return 0;
01903         }
01904 }

Here is the caller graph for this function:

int4 cluster_compareSequenceLengths ( const void *  sequence1,
const void *  sequence2 
)

Definition at line 432 of file cluster.c.

References sequence::length.

Referenced by cluster_simpleClusterSequences().

00433 {
00434         const struct sequence *a1, *a2;
00435 
00436         a1 = (struct sequence*)sequence1;
00437         a2 = (struct sequence*)sequence2;
00438 
00439         if (a1->length > a2->length)
00440         {
00441                 return 1;
00442         }
00443         else if (a1->length < a2->length)
00444         {
00445                 return -1;
00446         }
00447         else
00448         {
00449                 return 0;
00450         }
00451 }

Here is the caller graph for this function:

void cluster_freePSSM (  ) 

Definition at line 1462 of file cluster.c.

References cluster_PSSM, cluster_PSSMend, cluster_PSSMstart, and int4.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01463 {
01464         int4 column;
01465 
01466     column = cluster_PSSMstart;
01467     while (column < cluster_PSSMend)
01468     {
01469         free(cluster_PSSM[column]);
01470         column++;
01471     }
01472     cluster_PSSM += cluster_PSSMstart;
01473     free(cluster_PSSM);
01474 }

Here is the caller graph for this function:

struct edit * cluster_getEdits ( struct parent parent,
struct sequence child,
uint4 *  numEdits 
) [read]

Definition at line 409 of file cluster.c.

References edit::code, global_malloc(), sequence::length, edit::position, sequence::regionStart, sequence::sequence, parent::sequence, and uint4.

Referenced by cluster_writeClusters().

00410 {
00411         uint4 count = 0;
00412         struct edit* edits;
00413 
00414     *numEdits = 0;
00415     edits = (struct edit*)global_malloc(sizeof(struct edit) * child->length);
00416 
00417     while (count < child->length)
00418     {
00419         if (parent->sequence[count + child->regionStart] != child->sequence[count])
00420         {
00421                 edits[*numEdits].position = count;
00422             edits[*numEdits].code = child->sequence[count];
00423             (*numEdits)++;
00424         }
00425                 count++;
00426     }
00427 
00428     return edits;
00429 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 cluster_getWildcode ( struct parent parent,
uint4  position 
)

Definition at line 1676 of file cluster.c.

References parent::number, sequence::parent, wildCode::position, uint4, wildCode::wildCode, and parent::wildCodes.

Referenced by cluster_addChild(), and cluster_score().

01677 {
01678         uint4 count = 0;
01679 
01680     while (count < parent->numWildcards)
01681     {
01682                 if (parent->wildCodes[count].position == position)
01683                 return parent->wildCodes[count].wildCode;
01684         count++;
01685     }
01686 
01687     fprintf(stderr, "Internal error cluster_getWildcode(%d, %d)\n", parent->number, position);
01688     exit(-1);
01689 }

Here is the caller graph for this function:

struct wordLocation* cluster_mergeLists ( struct wordLocation wordLocations1,
uint4 *  numWordLocations1,
struct wordLocation wordLocations2,
uint4 *  numWordLocations2,
int4  mergeRelativeOffset,
uint *  lengthNewList 
) [read]

void cluster_mergeParents ( struct parent parent1,
struct parent parent2,
int4  relativeOffset 
)

Definition at line 1717 of file cluster.c.

References parent::children, cluster_addChild(), cluster_updateWildcode(), global_realloc(), int4, parent::length, parent::numChildren, parent::numWildcards, wildCode::position, sequence::regionStart, parent::sequence, uint4, wildCode::wildCode, and parent::wildCodes.

Referenced by cluster_processSequenceMatches().

01718 {
01719         int4 childNum;
01720     struct sequence* child;
01721         struct parent* temp;
01722         uint4 wildcodeCount = 0;
01723 
01724     // Arrange so that parent1 comes before parent2
01725     if (relativeOffset < 0)
01726     {
01727         temp = parent1;
01728         parent1 = parent2;
01729         parent2 = temp;
01730                 relativeOffset = -relativeOffset;
01731         }
01732 
01733     // If parent2 is longer than parent1
01734     if (parent2->length + relativeOffset > parent1->length)
01735     {
01736         // Add end of parent2 onto parent1
01737                 parent1->sequence = (char*)global_realloc(parent1->sequence, parent2->length + relativeOffset);
01738                 memcpy(parent1->sequence + parent1->length, parent2->sequence + parent1->length - relativeOffset,
01739                parent2->length - parent1->length + relativeOffset);
01740         parent1->length = parent2->length + relativeOffset;
01741 
01742         while (wildcodeCount < parent2->numWildcards)
01743         {
01744                         cluster_updateWildcode(parent1, parent2->wildCodes[wildcodeCount].position + relativeOffset,
01745                                    parent2->wildCodes[wildcodeCount].wildCode);
01746                         wildcodeCount++;
01747         }
01748     }
01749 
01750     // For each child in parent2
01751     childNum = 0;
01752     while (childNum < parent2->numChildren)
01753     {
01754         // Add child to parent1
01755         child = parent2->children[childNum];
01756                 cluster_addChild(parent1, child, relativeOffset + child->regionStart);
01757 
01758         childNum++;
01759     }
01760 
01761     // parent2 no longer has children
01762     free(parent2->children);
01763     parent2->children = NULL;
01764     free(parent2->sequence);
01765         parent2->sequence = NULL;
01766     parent2->length = 0;
01767         parent2->numChildren = 0;
01768 }

Here is the call graph for this function:

Here is the caller graph for this function:

void cluster_newParent ( struct parent newParent,
struct sequence child 
)

Definition at line 1644 of file cluster.c.

References parent::children, cluster_numParents, global_malloc(), sequence::length, parent::length, parent::number, parent::numChildren, parent::numWildcards, sequence::parent, sequence::regionStart, sequence::sequence, parent::sequence, and parent::wildCodes.

Referenced by cluster_buildCluster(), and cluster_processSequenceMatches().

01645 {
01646         // Make copy of the child
01647     parent->number = cluster_numParents; cluster_numParents++;
01648         parent->length = child->length;
01649         parent->sequence = (char*)global_malloc(parent->length);
01650     memcpy(parent->sequence, child->sequence, child->length);
01651 
01652     // Initialize list of wildcards
01653     parent->wildCodes = NULL;
01654     parent->numWildcards = 0;
01655 
01656     // Link parent to child, and vice versa
01657     parent->numChildren = 1;
01658     parent->children = (struct sequence**)global_malloc(sizeof(struct sequence*));
01659     parent->children[0] = child;
01660     child->regionStart = 0;
01661     child->parent = parent;
01662 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 cluster_overlap ( struct sequence sequence1,
struct sequence sequence2,
int4  relativeOffset 
)

Definition at line 1624 of file cluster.c.

References sequence::length, and uint4.

Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().

01625 {
01626         uint4 start1, end1;
01627 
01628     // Find start of match region
01629     if (relativeOffset > 0)
01630         start1 = relativeOffset;
01631     else
01632         start1 = 0;
01633 
01634     // Find end of match region
01635     if (sequence1->length < sequence2->length + relativeOffset)
01636         end1 = sequence1->length;
01637     else
01638         end1 = sequence2->length + relativeOffset;
01639 
01640     return end1 - start1;
01641 }

Here is the caller graph for this function:

void cluster_printCluster ( struct parent parent  ) 

Definition at line 1588 of file cluster.c.

References parent::children, int4, sequence::length, parent::length, sequence::number, parent::number, parent::numChildren, sequence::parent, print_singleSequence(), sequence::regionStart, sequence::sequence, and parent::sequence.

01589 {
01590         int4 count, childNum, numCharsSaved;
01591     struct sequence* child;
01592 
01593     printf("### Parent %d with %d children ###\nParent    ", parent->number, parent->numChildren);
01594         print_singleSequence(parent->sequence, parent->length); printf("\n");
01595 
01596     // For each child
01597     numCharsSaved = 0;
01598     childNum = 0;
01599     while (childNum < parent->numChildren)
01600     {
01601         child = parent->children[childNum];
01602 
01603         printf("[%7d] ", child->number);
01604         // Print spaces to align child to parent
01605         count = 0;
01606         while (count < child->regionStart)
01607         {
01608             printf(" ");
01609             count++;
01610         }
01611 
01612         // Print child
01613         print_singleSequence(child->sequence, child->length); printf("\n");
01614 //              printf("Child Length=%d start=%d\n", child->length, child->regionStart);
01615 
01616         numCharsSaved += child->length;
01617         childNum++;
01618     }
01619     printf("%d bytes saved\n", numCharsSaved - parent->length);
01620     printf("\n");
01621 }

Here is the call graph for this function:

void cluster_printList ( struct wordLocation list,
uint4  length 
)

Definition at line 1520 of file cluster.c.

References wordLocation::offset, wordLocation::sequenceNumber, and uint4.

01521 {
01522         uint4 count = 0;
01523 
01524     while (count < length)
01525     {
01526         printf("[%d,%d] ", list[count].sequenceNumber, list[count].offset);
01527         count++;
01528     }
01529     printf(".\n");
01530 }

void cluster_printSequence ( uint4  whitespace,
unsigned char *  sequence,
uint4  length 
)

Definition at line 1507 of file cluster.c.

References print_singleSequence().

01508 {
01509     // Print whitespace before sequence begins
01510     while (whitespace > 0)
01511     {
01512         printf(" ");
01513         whitespace--;
01514     }
01515 
01516     print_singleSequence(sequence, length); printf("\n");
01517 }

Here is the call graph for this function:

void cluster_processList ( struct memSingleBlock diagonals,
struct wordLocation wordLocations,
uint4  numWordLocations 
)

Definition at line 1133 of file cluster.c.

References int4, diagonal::matchSequence, memSingleBlock_getCurrent(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), diagonal::numMatches, wordLocation::offset, diagonal::relativeOffset, wordLocation::sequenceNumber, and uint4.

Referenced by cluster_clusterSequences(), and cluster_spexClusterSequences().

01135 {
01136         uint4 count1, count2, numNewDiagonalMatches = 0;
01137     struct wordLocation location1, location2;
01138         int4 relativeOffset;
01139     char matchFound;
01140     struct memSingleBlock *sequenceDiagonals;
01141     struct diagonal* diagonalMatch;
01142 
01143     // For each pair of locations
01144     numNewDiagonalMatches = 0;
01145     count1 = 0;
01146     while (count1 < numWordLocations)
01147     {
01148         count2 = count1 + 1;
01149         while (count2 < numWordLocations)
01150         {
01151             // location1 < location2
01152             if (wordLocations[count1].sequenceNumber < wordLocations[count2].sequenceNumber)
01153             {
01154                 location1 = wordLocations[count1];
01155                 location2 = wordLocations[count2];
01156             }
01157             else
01158             {
01159                 location2 = wordLocations[count1];
01160                 location1 = wordLocations[count2];
01161             }
01162 
01163             // If the locations don't both refer to the same sequence
01164             if (location1.sequenceNumber != location2.sequenceNumber)
01165             {
01166                 // Search list of matching diagonals for this sequence for one that is the
01167                 // same diagonal and matching sequence as the new match
01168                 relativeOffset = location1.offset - location2.offset;
01169                 matchFound = 0;
01170                 sequenceDiagonals = diagonals + location1.sequenceNumber;
01171                 memSingleBlock_resetCurrent(sequenceDiagonals);
01172                 while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL)
01173                 {
01174                     if (diagonalMatch->relativeOffset == relativeOffset &&
01175                         diagonalMatch->matchSequence == location2.sequenceNumber)
01176                     {
01177                         // If we found an identinal entry, increment number of matches
01178                         matchFound = 1;
01179                         diagonalMatch->numMatches++;
01180                         break;
01181                     }
01182                 }
01183 
01184                 // Otherwise create a new entry, ONLY if neither sequence1 or 2 belong to a cluster
01185                 if (!matchFound)// && sequences[wordLocations[count1].sequenceNumber].parent == NULL
01186                                 // && sequences[wordLocations[count2].sequenceNumber].parent == NULL)
01187                 {
01188                     diagonalMatch = memSingleBlock_newEntry(sequenceDiagonals);
01189                     diagonalMatch->relativeOffset = relativeOffset;
01190                     diagonalMatch->matchSequence = location2.sequenceNumber;
01191                     diagonalMatch->numMatches = 1;
01192                     numNewDiagonalMatches++;
01193                 }
01194             }
01195 
01196             count2++;
01197         }
01198         count1++;
01199     }
01200 
01201 //    printf("listLength=%d numNewDiagonalMatches=%d\n", numWordLocations,
01202 //           numNewDiagonalMatches);
01203 }

Here is the call graph for this function:

Here is the caller graph for this function:

void cluster_processSequenceMatches ( struct memSingleBlock sequenceMatches,
struct sequence sequences 
)

Definition at line 1054 of file cluster.c.

References memSingleBlock::block, cluster_addChild(), cluster_compareScore(), cluster_mergeParents(), cluster_newParent(), cluster_parents, cluster_score(), int4, memBlocks_newEntry(), memSingleBlock_getCurrent(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, sequence::parent, sequence::regionStart, and wildcardsThreshold.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01055 {
01056     struct sequenceMatch *sequenceMatch;
01057         struct sequence *seq1, *seq2;
01058     struct parent* parent;
01059     int4 score, temp;
01060 
01061     printf("Performing hierarchical clustering..."); fflush(stdout);
01062 
01063     // Sort the matches by score
01064     qsort(sequenceMatches->block, sequenceMatches->numEntries, sizeof(struct sequenceMatch),
01065           cluster_compareScore);
01066 
01067     // For each match
01068         memSingleBlock_resetCurrent(sequenceMatches);
01069     while ((sequenceMatch = memSingleBlock_getCurrent(sequenceMatches)) != NULL)
01070     {
01071         // If second sequence is a parent, reverse sequences
01072         if (sequences[sequenceMatch->sequence2].parent != NULL)
01073         {
01074                         temp = sequenceMatch->sequence1;
01075             sequenceMatch->sequence1 = sequenceMatch->sequence2;
01076             sequenceMatch->sequence2 = temp;
01077 
01078             sequenceMatch->relativeOffset *= -1;
01079         }
01080 
01081 //      printf("sequence match %d,%d\n", sequenceMatch->sequence1, sequenceMatch->sequence2);
01082 
01083         seq1 = sequences + sequenceMatch->sequence1;
01084         seq2 = sequences + sequenceMatch->sequence2;
01085 
01086         //printf("[%d,%d] ", seq1->number, seq2->number);
01087         // Case 1 - neither sequence has a parent
01088                 if (seq1->parent == NULL && seq2->parent == NULL)
01089         {
01090         //      printf("New cluster\n"); fflush(stdout);
01091 
01092 //              printf("New parent\n");
01093             parent = (struct parent*)memBlocks_newEntry(cluster_parents);
01094             cluster_newParent(parent, seq1);
01095             cluster_addChild(parent, seq2, sequenceMatch->relativeOffset);
01096         }
01097         // Case 2 - sequence1 has a parent, sequence2 does not
01098                 else if (seq1->parent != NULL && seq2->parent == NULL)
01099         {
01100         //      printf("Add to %d\n", seq1->parent->numChildren); fflush(stdout);
01101 
01102             // Check score for aligning sequence to parent
01103                 sequenceMatch->relativeOffset += seq1->regionStart;
01104             score = cluster_score(seq1, seq2, sequenceMatch->relativeOffset, wildcardsThreshold);
01105 //                      printf("[%d][%d,%d,%d]\n", score, seq1->parent->length, seq2->length, sequenceMatch->relativeOffset);
01106 
01107             if (score > 0)
01108             {
01109 //                printf("Add to existing\n");
01110                 cluster_addChild(seq1->parent, seq2, sequenceMatch->relativeOffset);
01111           //    printf("!"); fflush(stdout);
01112             }
01113         }
01114         // Case 3 - both sequences have parents
01115                 else if (seq1->parent != seq2->parent)
01116         {
01117         //      printf("Merge %d and %d\n", seq1->parent->numChildren, seq2->parent->numChildren); fflush(stdout);
01118 
01119                 sequenceMatch->relativeOffset += seq1->regionStart;
01120                 sequenceMatch->relativeOffset -= seq2->regionStart;
01121             score = cluster_score(seq1, seq2, sequenceMatch->relativeOffset, wildcardsThreshold);
01122             if (score > 0)
01123             {
01124                 cluster_mergeParents(seq1->parent, seq2->parent, sequenceMatch->relativeOffset);
01125                         }
01126         }
01127     }
01128 
01129     printf("done.\n"); fflush(stdout);
01130 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 cluster_removeChildren ( struct wordLocation wordLocations,
uint4  numWordLocations,
struct sequence sequences 
)

Definition at line 1478 of file cluster.c.

References global_malloc(), sequence::parent, sequence::sequence, wordLocation::sequenceNumber, and uint4.

Referenced by cluster_clusterSequences(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01480 {
01481         struct wordLocation *wordLocations2;
01482     uint4 numWordLocations2, locationNum;
01483     struct sequence sequence;
01484 
01485     wordLocations2 = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numWordLocations);
01486     numWordLocations2 = 0;
01487     locationNum = 0;
01488     while (locationNum < numWordLocations)
01489     {
01490         // If not a member of a cluster
01491         sequence = sequences[wordLocations[locationNum].sequenceNumber];
01492         if (sequence.parent == NULL)
01493         {
01494                 // Add to new list of word locations
01495                         wordLocations2[numWordLocations2] = wordLocations[locationNum];
01496             numWordLocations2++;
01497         }
01498         locationNum++;
01499         }
01500 
01501     // Return new pruned list of word locations
01502         memcpy(wordLocations, wordLocations2, sizeof(struct wordLocation) * numWordLocations2);
01503     free(wordLocations2);
01504     return numWordLocations2;
01505 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint4 cluster_score ( struct sequence seq1,
struct sequence seq2,
int4  relativeOffset,
float  wildscoreAllowed 
)

Definition at line 1907 of file cluster.c.

References clusterWildcard::averageScore, cluster_averageMatchScore, cluster_getWildcode(), encoding_aaStartWildcards, getbit, int4, parent::length, sequence::length, sequence::parent, sequence::regionStart, parent::sequence, sequence::sequence, sequenceMatch::sequence1, sequenceMatch::sequence2, setbit, uint4, wildcards_clusterWildcards, wildcards_numClusterWildcards, wildcardSumWindow, and clusterWildcard::wildCode.

Referenced by cluster_buildCluster(), cluster_clusterSequences(), cluster_processSequenceMatches(), cluster_simpleClusterSequences(), and cluster_spexClusterSequences().

01909 {
01910         int4 pos1, pos2, start1, end1, start2, end2;
01911     float sumWildscores = 0, windowWildscoreAllowed, sumWindowWildscores = 0;
01912     uint4 wildCode, wildcardCount;
01913     unsigned char* sequence1, *sequence2;
01914     int4 length1, length2;
01915     struct parent* parent1 = NULL, *parent2 = NULL;
01916 
01917     // Check if sequence1 has a parent
01918     if (seq1->parent == NULL)
01919     {
01920         sequence1 = seq1->sequence;
01921         length1 = seq1->length;
01922     }
01923     else
01924     {
01925         parent1 = seq1->parent;
01926         sequence1 = seq1->parent->sequence;
01927         length1 = seq1->parent->length;
01928         relativeOffset += seq1->regionStart;
01929     }
01930 
01931     // Check if sequence2 has a parent
01932     if (seq2->parent == NULL)
01933     {
01934         sequence2 = seq2->sequence;
01935         length2 = seq2->length;
01936     }
01937     else
01938     {
01939         parent2 = seq2->parent;
01940         sequence2 = seq2->parent->sequence;
01941         length2 = seq2->parent->length;
01942         relativeOffset -= seq2->regionStart;
01943     }
01944 
01945     // Find start of match region
01946     if (relativeOffset > 0)
01947     {
01948         start1 = relativeOffset;
01949         start2 = 0;
01950     }
01951     else
01952     {
01953         start1 = 0;
01954         start2 = -relativeOffset;
01955     }
01956 
01957     // Find end of match region
01958     if (length1 < length2 + relativeOffset)
01959     {
01960         end1 = length1;
01961         end2 = length1 - relativeOffset;
01962     }
01963     else
01964     {
01965         end1 = length2 + relativeOffset;
01966         end2 = length2;
01967     }
01968 
01969     // Parents do not overlap, return zero
01970     if (relativeOffset > length1 || -relativeOffset > length2)
01971     {
01972 //      printf("ro=%d\n", relativeOffset);
01973 //      printf("[%d,%d,%d]\n", start1, end1, length1);
01974 //      printf("[%d,%d,%d]\n", start2, end2, length2);
01975         return 0;
01976     }
01977 
01978         pos1 = start1;
01979     pos2 = start2;
01980 
01981     windowWildscoreAllowed = wildscoreAllowed * wildcardSumWindow;
01982     wildscoreAllowed *= (end1 - start1);
01983 
01984     // For each position where the two sequences overlap
01985     while (pos1 < end1)
01986     {
01987         // If the sequences differ
01988                 if (sequence1[pos1] != sequence2[pos2])
01989         {
01990             // Build a wildcode for this position
01991             wildCode = 0;
01992 
01993 //            printf("%c != %c p1=%p p2=%p\n", encoding_getLetter(sequence1[pos1]),
01994 //                   encoding_getLetter(sequence2[pos2]), parent1, parent2);
01995 
01996                         // Sequence1 char is included in parent2's wildcard
01997                         if (parent1 == NULL && parent2 != NULL &&
01998                 sequence2[pos2] >= encoding_aaStartWildcards &&
01999                 getbit(wildcards_clusterWildcards[sequence2[pos2] -
02000                 encoding_aaStartWildcards].wildCode, sequence1[pos1]))
02001                         {
02002                 wildCode = wildcards_clusterWildcards[sequence2[pos2] -
02003                            encoding_aaStartWildcards].wildCode;
02004             }
02005                         // Sequence2 char is included in parent1's wildcard
02006                         else if (parent2 == NULL && parent1 != NULL &&
02007                 sequence1[pos1] >= encoding_aaStartWildcards &&
02008                 getbit(wildcards_clusterWildcards[sequence1[pos1] -
02009                 encoding_aaStartWildcards].wildCode, sequence2[pos2]))
02010                         {
02011                 wildCode = wildcards_clusterWildcards[sequence1[pos1] -
02012                            encoding_aaStartWildcards].wildCode;
02013             }
02014                         else
02015             {
02016                 // If we are processing a single sequence (1)
02017                 if (parent1 == NULL || sequence1[pos1] < encoding_aaStartWildcards)
02018                 {
02019                     setbit(wildCode, sequence1[pos1]);
02020                 }
02021                 // Else we are processing a parent (1)
02022                 else
02023                 {
02024                     wildCode |= cluster_getWildcode(parent1, pos1);
02025                 }
02026 
02027                 // If we a processing a single sequence (2)
02028                 if (parent2 == NULL || sequence2[pos2] < encoding_aaStartWildcards)
02029                 {
02030                     setbit(wildCode, sequence2[pos2]);
02031                 }
02032                 // Else we are processing a parent (2)
02033                 else
02034                 {
02035                     wildCode |= cluster_getWildcode(parent2, pos2);
02036                 }
02037                         }
02038 
02039             //wildCodeAverageScore = cluster_averageWildcodeScore(wildCode);
02040             //wildCodeAverageScore = 0;
02041             //wildcards_printWildcard(wildCode);
02042                         //printf("Average score=%f\n", wildCodeAverageScore);
02043 
02044             // Check for a match in set of wildcards
02045             wildcardCount = 0;
02046             while (wildcardCount < wildcards_numClusterWildcards)
02047             {
02048                 if ((wildcards_clusterWildcards[wildcardCount].wildCode & wildCode) == wildCode)
02049                 {
02050 //                      printf("Match: ");
02051 //                          wildcards_printWildcard(wildcards_clusterWildcards[wildcardCount].wildCode);
02052                     break;
02053                 }
02054                 wildcardCount++;
02055             }
02056 
02057             sumWildscores += wildcards_clusterWildcards[wildcardCount].averageScore
02058                            - cluster_averageMatchScore;
02059             sumWindowWildscores += wildcards_clusterWildcards[wildcardCount].averageScore
02060                                  - cluster_averageMatchScore;
02061 
02062 /*            // Add wild's average score to tally
02063             if (wildcards_clusterWildcards[wildcardCount].averageScore > 0)
02064             {
02065                 sumWildscores += wildcards_clusterWildcards[wildcardCount].averageScore;
02066                 sumWindowWildscores += wildcards_clusterWildcards[wildcardCount].averageScore;
02067                         }*/
02068 
02069             // Stop when more than allowed total average score of wilds
02070             if (sumWildscores > wildscoreAllowed)
02071                 return 0;
02072 
02073             if (sumWindowWildscores > windowWildscoreAllowed)
02074                 return 0;
02075         }
02076 
02077         // Reset the window sum at the end of each window
02078         if (pos1 % wildcardSumWindow == 0)
02079                         sumWindowWildscores = 0;
02080 
02081         pos1++;
02082         pos2++;
02083     }
02084 
02085     // Return score
02086 //    return (end1 - start1) * (wildscoreAllowed - sumWildscores) / wildscoreAllowed;
02087     return 2000 - (1000 * sumWildscores / (end1 - start1));
02088 }

Here is the call graph for this function:

Here is the caller graph for this function:

void cluster_simpleClusterSequences ( struct sequence sequences,
uint4  numberOfSequences,
uint4  numberOfLetters 
)

Definition at line 767 of file cluster.c.

References cluster_buildCluster(), cluster_buildPSSM(), cluster_compareSequenceLengths(), cluster_freePSSM(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), global_malloc(), sequence::length, maxListLength, memSingleBlock_free(), memSingleBlock_initialize(), memSingleBlock_newEntry(), wordLocation::offset, wordLocation::sequenceNumber, uint4, and wildcardsThreshold.

00769 {
00770     uint4 sequenceLength = 0, sequenceCount, startGroup, endGroup, numWordLocations;
00771     struct sequence* sequence, *seq1, *seq2;
00772         struct wordLocation* wordLocations;
00773     uint4 count1, count2, score;
00774     struct memSingleBlock* sequenceMatches;
00775     struct sequenceMatch *sequenceMatch;
00776 
00777     // Temporary buffer for storing word locations
00778     wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences);
00779 
00780     // Sort sequences by length
00781     qsort(sequences, numberOfSequences, sizeof(struct sequence),
00782           cluster_compareSequenceLengths);
00783 
00784     sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences);
00785 
00786     // For each sequence in order of length
00787     sequenceCount = 0;
00788     while (sequenceCount < numberOfSequences)
00789     {
00790         sequence = sequences + sequenceCount;
00791         sequenceLength = sequence->length;
00792         startGroup = sequenceCount;
00793 
00794         // Find end of group of sequences with same length
00795         while (sequenceCount < numberOfSequences)
00796         {
00797             sequence = sequences + sequenceCount;
00798 
00799             if (sequence->length != sequenceLength)
00800             {
00801                 break;
00802             }
00803             sequenceCount++;
00804                 }
00805         endGroup = sequenceCount;
00806 
00807         numWordLocations = 0;
00808         while (numWordLocations < endGroup - startGroup)
00809         {
00810                         wordLocations[numWordLocations].sequenceNumber = startGroup + numWordLocations;
00811                         wordLocations[numWordLocations].offset = 0;
00812                 numWordLocations++;
00813         }
00814 
00815         if (numWordLocations >= maxListLength &&
00816             !cluster_buildPSSM(wordLocations, numWordLocations, sequences))
00817                 {
00818             while (numWordLocations >= maxListLength)
00819             {
00820                 // Build cluster from longest list of word locations
00821                 cluster_buildCluster(wordLocations, numWordLocations, sequences);
00822 
00823                 // cluster_printList(wordLocations, numWordLocations);
00824 
00825                 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences);
00826 //                printf("Now %d\n", numWordLocations); fflush(stdout);
00827             }
00828 
00829             // Free the PSSM
00830             cluster_freePSSM();
00831                 }
00832 
00833         // For each pair of remaining sequences
00834                 count1 = 0;
00835         while (count1 < numWordLocations)
00836         {
00837             count2 = count1 + 1;
00838             while (count2 < numWordLocations)
00839             {
00840                                 seq1 = sequences + wordLocations[count1].sequenceNumber;
00841                                 seq2 = sequences + wordLocations[count2].sequenceNumber;
00842 
00843                 // Compare the two sequences in detail
00844                 score = cluster_score(seq1, seq2, 0, wildcardsThreshold);
00845 
00846                 if (score > 0)
00847                 {
00848                     // Record high-scoring match between pair of sequences
00849                     sequenceMatch = memSingleBlock_newEntry(sequenceMatches);
00850                     sequenceMatch->sequence1 = wordLocations[count1].sequenceNumber;
00851                     sequenceMatch->sequence2 = wordLocations[count2].sequenceNumber;
00852                     sequenceMatch->relativeOffset = 0;
00853                     sequenceMatch->score = score;
00854                 }
00855 
00856                 count2++;
00857                         }
00858             count1++;
00859                 }
00860         printf("length=%d num=%d\n", sequence->length, endGroup - startGroup);
00861     }
00862 
00863     // Process sequence matches
00864     cluster_processSequenceMatches(sequenceMatches, sequences);
00865 
00866     memSingleBlock_free(sequenceMatches);
00867 }

Here is the call graph for this function:

void cluster_spexClusterSequences ( struct sequence sequences,
uint4  numberOfSequences,
uint4  numberOfLetters 
)

Definition at line 453 of file cluster.c.

References memSingleBlock::block, cluster_buildCluster(), cluster_buildPSSM(), cluster_freePSSM(), cluster_overlap(), cluster_processList(), cluster_processSequenceMatches(), cluster_removeChildren(), cluster_score(), decoQantum, decoWordLength, global_malloc(), hashcounter_count(), hashcounter_free(), hashcounter_hash, hashcounter_insert(), hashcounter_multiple(), hashcounter_new(), hashcounter_single(), int4, sequence::length, diagonal::matchSequence, maxListLength, memSingleBlock_free(), memSingleBlock_getCurrent(), memSingleBlock_initialize(), memSingleBlock_initializeExisting(), memSingleBlock_newEntry(), memSingleBlock_resetCurrent(), memSingleBlock::numEntries, numHashtablePasses, numIterations, diagonal::numMatches, percentMatchesRequired, postings_addEntry(), postings_decodeList(), postings_getSortedLists(), postings_initialize(), postings_numLists, diagonal::relativeOffset, sequence::sequence, uint4, and wildcardsThreshold.

Referenced by main().

00455 {
00456         Hashcounter *hashCounter1 = NULL, *hashCounter2 = NULL;
00457     uint4 words, duplicates, iteration = 0, seed, wordLength;
00458     uint4 sequenceNumber, sequenceSize, offset, suboffset;
00459         unsigned char* sequence, *word;
00460     uint4 numHashtablePasses = 50, hashtablePass = 0, passhash;
00461         uint4 hashCounterSize = 1, sinceChange, submatches;
00462     int4 key, numWordLocations;
00463     struct wordLocation* wordLocations;
00464     int4 relativeOffset;
00465     uint4 sequence1, sequence2, score;
00466         struct memSingleBlock* diagonals, *sequenceDiagonals;
00467     struct diagonal* diagonalMatch;
00468     struct sequenceMatch *sequenceMatch;
00469     struct memSingleBlock* sequenceMatches;
00470         struct sequence* seq1, *seq2;
00471     int4 time;
00472     struct finalList* finalLists;
00473     uint4 hashValue1, hashValue2, position, lookup;
00474 
00475     // Calculate size of hashcounter
00476     while (pow(2,hashCounterSize) < numberOfLetters / decoQantum)
00477     {
00478         hashCounterSize++;
00479     }
00480     hashCounter2 = hashcounter_new(hashCounterSize, 0);
00481 
00482     wordLength = decoWordLength - ((numIterations - 1) * decoQantum);
00483 
00484     while (iteration < numIterations)
00485     {
00486         time = -clock();
00487         words = 0; duplicates = 0;
00488         sinceChange = 0;
00489 
00490         // For each sequence
00491         sequenceNumber = 0;
00492         while (sequenceNumber < numberOfSequences)
00493         {
00494             sequence = sequences[sequenceNumber].sequence;
00495             sequenceSize = sequences[sequenceNumber].length;
00496 
00497             // For each sequence longer enough that hasn't been excluded
00498             if (sequenceSize >= wordLength)
00499             {
00500                 // For each word
00501                 offset = 0;
00502                 while (offset < sequenceSize - wordLength)
00503                 {
00504                     // If word has occured before
00505                     word = sequence + offset;
00506                     hashcounter_hash(hashCounter2, word, wordLength, hashValue2, position);
00507                     lookup = hashcounter_single(hashCounter2, hashValue2);
00508 
00509                     if (lookup)
00510                         sinceChange = 0;
00511 
00512                     if (lookup == 1)
00513                     {
00514                         // If it has only occured once, now it has occured twice
00515                         hashcounter_insert(hashCounter2, hashValue2);
00516                     }
00517                     else
00518                     {
00519                                                 submatches = 0;
00520                         // First iteration
00521                         if (hashCounter1 != NULL)
00522                         {
00523                                 suboffset = offset;
00524                                 while (suboffset <= offset + decoQantum)
00525                             {
00526                                 word = sequence + suboffset;
00527                                 hashcounter_hash(hashCounter1, word, wordLength - decoQantum, hashValue1, position);
00528                                 if (hashcounter_multiple(hashCounter1, hashValue1))
00529                                         submatches++;
00530                                                                 suboffset++;
00531                             }
00532                         }
00533                         if (sinceChange >= decoQantum && (hashCounter1 == NULL || submatches >= 2))
00534                         {
00535                                                         hashcounter_insert(hashCounter2, hashValue2);
00536                             sinceChange = 0;
00537                         }
00538                                         }
00539 
00540                     sinceChange++;
00541                     offset++;
00542                 }
00543                         }
00544 
00545             sequenceNumber++;
00546         }
00547 
00548         if (hashCounter1)
00549                 hashcounter_free(hashCounter1);
00550                 hashCounter1 = hashCounter2;
00551         seed = rand();
00552 
00553         if (iteration != numIterations - 1)
00554                 hashCounter2 = hashcounter_new(hashCounterSize, 0);
00555 
00556         printf("Iteration %d WordLength=%d\n", iteration, wordLength); fflush(stdout);
00557         hashcounter_count(hashCounter1, stdout); fflush(stdout);
00558 //        printf("Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout);
00559 
00560         time += clock();
00561         printf("Iteration time=%.2f secs\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00562 
00563         iteration++;
00564         wordLength += decoQantum;
00565         }
00566 
00567     // Temporary buffer for storing word locations
00568     wordLocations = (struct wordLocation*)global_malloc(sizeof(struct wordLocation) * numberOfSequences);
00569 
00570     // Initialize diagonals
00571     diagonals = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock)
00572                                                       * numberOfSequences);
00573         sequenceNumber = 0;
00574     while (sequenceNumber < numberOfSequences)
00575     {
00576         memSingleBlock_initializeExisting(diagonals + sequenceNumber, sizeof(struct diagonal), 1);
00577         sequenceNumber++;
00578     }
00579 
00580     printf("Initialized diagonals\n"); fflush(stdout);
00581 
00582     wordLength -= decoQantum;
00583     seed = rand();
00584 
00585     printf("Constructing and processing postings lists"); fflush(stdout);
00586     // Process collection a section at a time
00587     while (hashtablePass < numHashtablePasses)
00588     {
00589         time = -clock();
00590         postings_initialize();
00591 
00592         // For each sequence
00593         sequenceNumber = 0;
00594         while (sequenceNumber < numberOfSequences)
00595         {
00596             sequence = sequences[sequenceNumber].sequence;
00597             sequenceSize = sequences[sequenceNumber].length;
00598 
00599             // For each sequence long enough that hasn't been excluded
00600             if (sequenceSize >= wordLength)
00601             {
00602                 // Create initial passhash
00603                 passhash = 0;
00604                                 offset = 0;
00605                 while (offset < wordLength)
00606                 {
00607                                         passhash += sequence[offset];
00608                         offset++;
00609                 }
00610 
00611                 // For each word
00612                 offset = 0;
00613                 while (offset < sequenceSize - wordLength)
00614                 {
00615                         // If to be processed this pass
00616                     if (passhash % numHashtablePasses == hashtablePass)
00617                     {
00618                         // Insert into hash table
00619                         word = sequence + offset;
00620                         hashcounter_hash(hashCounter1, word, wordLength, hashValue1, position);
00621                         if (hashcounter_multiple(hashCounter1, hashValue1))
00622                         {
00623                                 postings_addEntry(sequence + offset, wordLength, sequenceNumber, offset);
00624                         }
00625                                         }
00626 
00627                     // Update passhash
00628                     passhash -= sequence[offset];
00629                     passhash += sequence[offset + wordLength];
00630                     offset++;
00631                 }
00632             }
00633 
00634             sequenceNumber++;
00635         }
00636 
00637         // Report the load on the hashtable
00638 //        printf("Hashtable pass %d\n", hashtablePass);
00639 //        postings_print();
00640 //        printf("Total malloced=%s\n", global_int4toString(global_totalMalloc));
00641                 time += clock();
00642 //              printf("Hashtable construction time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00643                 time = -clock();
00644 
00645         // Sort hash table entries in order of size
00646                 finalLists = postings_getSortedLists();
00647 
00648         // Start with the longest entry
00649         key = postings_numLists - 1;
00650         while (key >= 0)
00651         {
00652             // Get each entry in descending order of size
00653                         numWordLocations = postings_decodeList(finalLists[key], wordLocations);
00654 
00655             if (numWordLocations > maxListLength)
00656             {
00657 //                printf("Was %d\n", numWordLocations); fflush(stdout);
00658 
00659                 // Remove any locations refering to already clustered sequences
00660                 numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences);
00661 
00662                 if (numWordLocations >= maxListLength &&
00663                     !cluster_buildPSSM(wordLocations, numWordLocations, sequences))
00664                 {
00665                     while (numWordLocations >= maxListLength)
00666                     {
00667                         // Build cluster from longest list of word locations
00668                         cluster_buildCluster(wordLocations, numWordLocations, sequences);
00669 
00670                         //      cluster_printList(wordLocations, numWordLocations);
00671 
00672                         numWordLocations = cluster_removeChildren(wordLocations, numWordLocations, sequences);
00673 //                        printf("Now %d\n", numWordLocations); fflush(stdout);
00674                     }
00675 
00676                     // Free the PSSM
00677                     cluster_freePSSM();
00678                 }
00679             }
00680 
00681             // Process a list of word locations and insert/update diagonal entries
00682             if (numWordLocations > 1)
00683                 cluster_processList(diagonals, wordLocations, numWordLocations);
00684 
00685             key--;
00686         }
00687 
00688         free(finalLists);
00689 
00690                 time += clock();
00691 //              printf("Hashtable processing time=%f\n", (float)time / CLOCKS_PER_SEC); fflush(stdout);
00692                 printf("."); fflush(stdout);
00693 
00694         hashtablePass++;
00695     }
00696 
00697         printf("done.\n"); fflush(stdout);
00698     free(wordLocations);
00699 
00700     // Free hash counter
00701     hashcounter_free(hashCounter1);
00702 
00703     printf("Identifying high-scoring sequence pairs..."); fflush(stdout);
00704 
00705     sequenceMatches = memSingleBlock_initialize(sizeof(struct sequenceMatch), numberOfSequences);
00706 
00707     // For each sequence
00708     sequenceNumber = 0;
00709     while (sequenceNumber < numberOfSequences)
00710     {
00711         // Go through list of matching diagonals
00712         sequenceDiagonals = diagonals + sequenceNumber;
00713         memSingleBlock_resetCurrent(sequenceDiagonals);
00714         while ((diagonalMatch = memSingleBlock_getCurrent(sequenceDiagonals)) != NULL)
00715         {
00716                 sequence1 = sequenceNumber;
00717             sequence2 = diagonalMatch->matchSequence;
00718             relativeOffset = diagonalMatch->relativeOffset;
00719             seq1 = sequences + sequence1;
00720             seq2 = sequences + sequence2;
00721 
00722             // Calculate rough score based on number of matches, length of overlap region
00723             score = 100 * diagonalMatch->numMatches /
00724                     cluster_overlap(seq1, seq2, relativeOffset) + decoWordLength;
00725 
00726             if (score > percentMatchesRequired)// && relativeOffset == 0 && seq1->length == seq2->length)
00727             {
00728                 // Compare the two sequences in detail
00729                 score = cluster_score(seq1, seq2, relativeOffset, wildcardsThreshold);
00730                 if (score > 0)
00731                 {
00732 //                    printf("Sequences %d,%d lengths %d,%d offset %d matches=%d score=%d\n",
00733 //                           sequence1, sequence2, sequences[sequence1].length, sequences[sequence2].length,
00734 //                           relativeOffset, diagonalMatch->numMatches, score);
00735 
00736                     // Record high-scoring match between pair of sequences
00737                     sequenceMatch = memSingleBlock_newEntry(sequenceMatches);
00738                     sequenceMatch->sequence1 = sequence1;
00739                     sequenceMatch->sequence2 = sequence2;
00740                     sequenceMatch->relativeOffset = relativeOffset;
00741                     sequenceMatch->score = score;
00742                 }
00743             }
00744 
00745         }
00746         sequenceNumber++;
00747     }
00748 
00749     printf("done. (%d pairs)\n", sequenceMatches->numEntries); fflush(stdout);
00750 
00751     // Free memory used to store diagonal matches
00752         sequenceNumber = 0;
00753     while (sequenceNumber < numberOfSequences)
00754     {
00755         free(diagonals[sequenceNumber].block);
00756         sequenceNumber++;
00757     }
00758     free(diagonals);
00759 
00760     // Process sequence matches
00761     cluster_processSequenceMatches(sequenceMatches, sequences);
00762 
00763     memSingleBlock_free(sequenceMatches);
00764 }

Here is the call graph for this function:

Here is the caller graph for this function:

void cluster_updateWildcode ( struct parent parent,
uint4  position,
uint4  wildCode 
)

Definition at line 1692 of file cluster.c.

References global_realloc(), parent::numWildcards, sequence::parent, wildCode::position, uint4, wildCode::wildCode, and parent::wildCodes.

Referenced by cluster_addChild(), and cluster_mergeParents().

01693 {
01694         uint4 count;
01695 
01696         count = 0;
01697     while (count < parent->numWildcards)
01698     {
01699                 if (parent->wildCodes[count].position == position)
01700         {
01701                 parent->wildCodes[count].wildCode = wildCode;
01702                 return;
01703                 }
01704         count++;
01705     }
01706 
01707     // If position not found, add wildcode
01708         parent->numWildcards++;
01709         parent->wildCodes = (struct wildCode*)global_realloc(parent->wildCodes, parent->numWildcards
01710                       * sizeof(struct wildCode));
01711 
01712     parent->wildCodes[parent->numWildcards - 1].position = position;
01713     parent->wildCodes[parent->numWildcards - 1].wildCode = wildCode;
01714 }

Here is the call graph for this function:

Here is the caller graph for this function:

void cluster_updateWildcodes ( struct parent parent,
int4  change 
)

Definition at line 1665 of file cluster.c.

References sequence::parent, wildCode::position, uint4, and parent::wildCodes.

Referenced by cluster_addChild().

01666 {
01667         uint4 count = 0;
01668     while (count < parent->numWildcards)
01669     {
01670                 parent->wildCodes[count].position += change;
01671         count++;
01672     }
01673 }

Here is the caller graph for this function:

void cluster_writeClusters ( char *  filename,
struct sequence sequences 
)

Definition at line 252 of file cluster.c.

References parent::children, cluster_getEdits(), cluster_parents, wild::code, child::description, child::descriptionLength, sequence::descriptionLength, sequence::descriptionLocation, descriptions_getDescription(), child::edits, global_malloc(), sequence::length, child::length, parent::length, memBlocks_getCurrent(), memBlocks_resetCurrent(), parent::numChildren, child::numEdits, readdb_dbAlphabetType, readdb_longestSequenceLength, readdb_numberOfSequences, sequence::regionStart, child::regionStart, sequence::sequence, parent::sequence, uint4, wildcards_clusterWildcards, wildcards_countOccurences(), wildcards_getOccurences(), wildcards_initializeCountOccurences(), wildcards_numClusterWildcards, wildcards_outputWildcards(), wildcards_scoreCandidates(), clusterWildcard::wildCode, parent::wildCodes, writedb_addSequence(), writedb_close(), writedb_dataFilename, writedb_descriptionsFilename, writedb_initialize(), and writedb_sequenceFilename.

Referenced by main().

00253 {
00254     struct parent* parent;
00255         char *newFilename, *wildcardsOutFilename, *renamedFilename;
00256     uint4 count, sequenceNumber, sequenceLength, descriptionLength, descriptionLocation;
00257         unsigned char *sequence, *description;
00258     uint4 childNum, numChildren, descriptionTotalLength, numWilds;
00259         struct child* children;
00260         struct wild* wilds, *wildcards;
00261 
00262     printf("Writing clusters to disk..."); fflush(stdout);
00263 
00264         // Construct sequence and description filenames
00265         newFilename = (char*)global_malloc(strlen(filename) + 12);
00266         sprintf(newFilename, "%s_clustered", filename);
00267 
00268     // Initialize writing to formatted database
00269     writedb_initialize(newFilename, readdb_dbAlphabetType);
00270 
00271     // Initialize wildcard occurence counting
00272     wildcards_initializeCountOccurences(readdb_longestSequenceLength);
00273 
00274     // For each parent
00275     memBlocks_resetCurrent(cluster_parents);
00276         while ((parent = memBlocks_getCurrent(cluster_parents)) != NULL)
00277     {
00278         if (parent->children != NULL && parent->numChildren > 1)
00279         {
00280             sequenceLength = parent->length;
00281             sequence = parent->sequence;
00282             numChildren = parent->numChildren;
00283 
00284             children = (struct child*)global_malloc(sizeof(struct child) * numChildren);
00285 
00286             // For each child
00287             childNum = 0;
00288             descriptionTotalLength = 0;
00289             while (childNum < numChildren)
00290             {
00291                 // Load description from old collection
00292                 descriptionLocation = parent->children[childNum]->descriptionLocation;
00293                 descriptionLength = parent->children[childNum]->descriptionLength;
00294                 description = descriptions_getDescription(descriptionLocation, descriptionLength);
00295 //                description = parent->children[childNum]->description;
00296 
00297                                 children[childNum].description = description;
00298                                 children[childNum].descriptionLength = descriptionLength;
00299                                 children[childNum].regionStart = parent->children[childNum]->regionStart;
00300                 children[childNum].length = parent->children[childNum]->length;
00301                 children[childNum].edits = cluster_getEdits(parent, parent->children[childNum],
00302                                                             &(children[childNum].numEdits));
00303                 childNum++;
00304             }
00305 
00306             // Add sequence to the formatted collection
00307             writedb_addSequence(sequence, sequenceLength, NULL, 0, NULL,
00308                                 0, children, numChildren);
00309 
00310                         // Count final wildcard occurences
00311             wildcards_countOccurences(children, numChildren, sequenceLength);
00312 
00313             // Free memory used for children and edits
00314             childNum = 0;
00315             while (childNum < numChildren)
00316             {
00317                 free(children[childNum].edits);
00318                 free(children[childNum].description);
00319                 childNum++;
00320             }
00321             free(children);
00322 
00323             // Print
00324 //            if (numChildren > 1)
00325 //                      cluster_printCluster(parent);
00326                 }
00327 
00328                 // Free the parent
00329         free(parent->sequence);
00330         free(parent->children);
00331         free(parent->wildCodes);
00332     }
00333 
00334     printf("done.\nWriting remaining sequences to disk..."); fflush(stdout);
00335 
00336     // For each sequence without a parent or only child
00337     sequenceNumber = 0;
00338         while (sequenceNumber < readdb_numberOfSequences)
00339         {
00340         if (sequences[sequenceNumber].parent == NULL ||
00341             sequences[sequenceNumber].parent->numChildren == 1)
00342         {
00343             sequenceLength = sequences[sequenceNumber].length;
00344             sequence = sequences[sequenceNumber].sequence;
00345             descriptionLength = sequences[sequenceNumber].descriptionLength;
00346             descriptionLocation = sequences[sequenceNumber].descriptionLocation;
00347 
00348             // Load description from old collection
00349 //            description = sequences[sequenceNumber].description;
00350             description = descriptions_getDescription(descriptionLocation, descriptionLength);
00351 
00352             // Add sequence to the formatted collection
00353             writedb_addSequence(sequence, sequenceLength, description,
00354                                 descriptionLength, NULL, 0, NULL, 0);
00355 
00356                         free(description);
00357                 }
00358 
00359         // Free the sequence
00360 //        free(sequences[sequenceNumber].sequence);
00361 
00362         sequenceNumber++;
00363         }
00364 
00365     printf("done.\n"); fflush(stdout);
00366 
00367     // Get list of wild occurences
00368         wilds = wildcards_getOccurences(&numWilds);
00369 
00370     // Score the wildcards using occurences
00371     wildcards = (struct wild*)global_malloc(sizeof(struct wild) * wildcards_numClusterWildcards); 
00372     count = 0;
00373     while (count < wildcards_numClusterWildcards)
00374     {
00375                 wildcards[count].code = wildcards_clusterWildcards[count].wildCode;
00376         count++;
00377         }
00378         wildcards_scoreCandidates(wildcards, wildcards_numClusterWildcards, wilds, numWilds, 0);
00379     free(wilds);
00380 
00381     // Output wildcards scoring info
00382     wildcardsOutFilename = (char*)global_malloc(strlen(newFilename) + 12);
00383         sprintf(wildcardsOutFilename, "%s.wildcards", newFilename);
00384     wildcards_outputWildcards(wildcardsOutFilename);
00385 
00386     // Finalize writing to the new formatted collection
00387     writedb_close();
00388 
00389     // Rename clustered collection
00390     renamedFilename = (char*)global_malloc(strlen(filename) + 20);
00391         sprintf(renamedFilename, "%s.wildcards", filename);
00392     if (rename(wildcardsOutFilename, renamedFilename))
00393         fprintf(stderr, "Error renaming file %s to %s\n", wildcardsOutFilename, renamedFilename);
00394 
00395         sprintf(renamedFilename, "%s.sequences", filename);
00396     if (rename(writedb_sequenceFilename, renamedFilename))
00397         fprintf(stderr, "Error renaming file %s to %s\n", writedb_sequenceFilename, renamedFilename);
00398 
00399         sprintf(renamedFilename, "%s.descriptions", filename);
00400     if (rename(writedb_descriptionsFilename, renamedFilename))
00401         fprintf(stderr, "Error renaming file %s to %s\n", writedb_descriptionsFilename, renamedFilename);
00402 
00403         sprintf(renamedFilename, "%s.data", filename);
00404     if (rename(writedb_dataFilename, renamedFilename))
00405         fprintf(stderr, "Error renaming file %s to %s\n", writedb_dataFilename, renamedFilename);
00406 }

Here is the call graph for this function:

Here is the caller graph for this function:

int4 main ( int4  argc,
char *  argv[] 
)

Definition at line 135 of file cluster.c.

References scoreMatrix::averageMatchScore, cluster_averageMatchScore, cluster_averageWildcodeScores, cluster_calculateSavings(), cluster_parents, cluster_spexClusterSequences(), cluster_writeClusters(), sequence::descriptionLength, sequence::descriptionLocation, encoding_alphabetTypes, encoding_free(), encoding_initialize(), encoding_protein, global_free(), global_malloc(), sequence::length, memBlocks_free(), memBlocks_initialize(), sequence::number, parameters_findScoringMatrix(), parameters_free(), parameters_scoringMatrixPath, sequence::parent, readdb_close(), readdb_dbAlphabetType, readdb_longestSequenceLength, readdb_numberOfClusters, readdb_numberOfLetters, readdb_numberOfSequences, readdb_open(), readdb_readSequence(), scoreMatrix_free(), scoreMatrix_load(), sequence::sequence, uint4, wildcards_initialize(), wildcards_readWildcards(), and wildcards_scoreMatrix.

00136 {
00137         unsigned char *filename, *sequence;
00138         uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
00139         uint4 sequenceNumber = 0, encodedLength;
00140         struct sequence* sequences;
00141 
00142         // User must provide FASTA format file at command line
00143         if (argc < 2)
00144         {
00145                 fprintf(stderr, "Useage: cluster <Database>\n");
00146                 exit(-1);
00147         }
00148         filename = argv[1];
00149 
00150     // Open formatted file for reading
00151     readdb_open(filename);
00152 
00153         printf("Number of sequences = %u\n", readdb_numberOfSequences);
00154         printf("Total number of letters = %llu\n", readdb_numberOfLetters);
00155         printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
00156         printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);
00157 
00158     if (readdb_dbAlphabetType != encoding_protein)
00159     {
00160         fprintf(stderr, "Error: Only protein collection clustering currently supported\n");
00161                 exit(-1);
00162     }
00163 
00164     if (readdb_numberOfSequences != readdb_numberOfClusters)
00165     {
00166         fprintf(stderr, "Error clustering %s: collection has already been clustered\n", filename);
00167         exit(-1);
00168     }
00169 
00170     sequences = (struct sequence*)global_malloc(sizeof(struct sequence) * readdb_numberOfSequences);
00171 
00172         // Initialize codes array
00173         encoding_initialize(readdb_dbAlphabetType);
00174 
00175     // Initialize cluster wildcards
00176         wildcards_initialize();
00177 
00178     if (argc == 5)
00179     {
00180         // Read in a set of wildcards
00181         wildcards_readWildcards(argv[4]);
00182     }
00183 
00184     // Load score matrix
00185     parameters_findScoringMatrix();
00186     wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);
00187 
00188     // Calculate average score for matching two residues
00189     cluster_averageMatchScore = wildcards_scoreMatrix.averageMatchScore;
00190 
00191     // Move through index file reading sequence codes and converting back to ASCII sequences
00192     sequenceNumber = 0;
00193         while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
00194                                &descriptionLength, &encodedLength))
00195         {
00196         // Record sequence, sequence length, number, no parent
00197         sequences[sequenceNumber].sequence = sequence;
00198         sequences[sequenceNumber].length = sequenceLength;
00199         sequences[sequenceNumber].number = sequenceNumber;
00200         sequences[sequenceNumber].parent = NULL;
00201         sequences[sequenceNumber].descriptionLength = descriptionLength;
00202         sequences[sequenceNumber].descriptionLocation = descriptionStart;
00203 
00204                 sequenceNumber++;
00205         }
00206 
00207     if (sequenceNumber != readdb_numberOfSequences)
00208     {
00209         fprintf(stderr, "Error: only %d sequences read\n", sequenceNumber);
00210         exit(-1);
00211     }
00212 
00213     printf("%d sequences read.\n", sequenceNumber);
00214         fflush(stdout);
00215 
00216     // Initialize array for storing parents
00217         cluster_parents = memBlocks_initialize(sizeof(struct parent), 10000);
00218 
00219     // Simple approach to clustering sequences
00220 //    cluster_simpleClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters);
00221 
00222         // Perform SPEX algorithm and cluster sequences
00223     cluster_spexClusterSequences(sequences, sequenceNumber, readdb_numberOfLetters);
00224 
00225     // Cluster sequences in the collection
00226 //    cluster_clusterSequences(sequences, sequenceNumber, readdb_numberOfLetters, filename);
00227 
00228     printf("Total bytes saved=%d\n", cluster_calculateSavings()); fflush(stdout);
00229 
00230 //    printf("END Total malloced=%s\n", global_int4toString(global_totalMalloc)); fflush(stdout);
00231 
00232     // Load sequence descriptions from disk
00233 //    cluster_loadDescriptions(sequences);
00234 
00235         // Write clusters to disk
00236     cluster_writeClusters(filename, sequences);
00237 
00238     // Close reading formatted database
00239     readdb_close();
00240 
00241     free(cluster_averageWildcodeScores);
00242     free(sequences);
00243     memBlocks_free(cluster_parents);
00244     encoding_free();
00245     global_free();
00246     scoreMatrix_free(wildcards_scoreMatrix);
00247     parameters_free();
00248     return 0;
00249 }

Here is the call graph for this function:


Variable Documentation

float cluster_averageMatchScore = 0

Definition at line 23 of file cluster.c.

Referenced by cluster_score(), and main().

float* cluster_averageWildcodeScores = NULL

Definition at line 24 of file cluster.c.

Referenced by cluster_averageWildcodeScore(), and main().

uint4 cluster_numMergeDiagonals

Definition at line 70 of file cluster.c.

uint4 cluster_numParents = 0

Definition at line 72 of file cluster.c.

Referenced by cluster_newParent().

struct memBlocks* cluster_parents

Definition at line 71 of file cluster.c.

Referenced by cluster_buildCluster(), cluster_calculateSavings(), cluster_processSequenceMatches(), cluster_writeClusters(), and main().

uint4** cluster_PSSM

Definition at line 73 of file cluster.c.

Referenced by cluster_buildCluster(), cluster_buildPSSM(), and cluster_freePSSM().

int4 cluster_PSSMend

Definition at line 74 of file cluster.c.

Referenced by cluster_buildPSSM(), and cluster_freePSSM().

int4 cluster_PSSMlength

Definition at line 74 of file cluster.c.

Referenced by cluster_buildPSSM().

int4 cluster_PSSMstart

Definition at line 74 of file cluster.c.

Referenced by cluster_buildPSSM(), and cluster_freePSSM().

float wildcardsThreshold = 0.25

Definition at line 69 of file cluster.c.

Referenced by cluster_buildCluster(), cluster_clusterSequences(), cluster_processSequenceMatches(), cluster_simpleClusterSequences(), cluster_spexClusterSequences(), and rsdb_spexClusterSequences().


Generated on Wed Dec 19 20:54:47 2007 for fsa-blast by  doxygen 1.5.2