include/dust.h File Reference

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  chunk
struct  maskRegion

Functions

void dust_dustSequence (char *originalSequence)
void dust_processChunk (int windowLength, unsigned char *sequence, int chunkStart, struct chunk *chunk)
void dust_processWindow (int windowLength, int windowStart, struct chunk *chunk, unsigned char *sequence)


Function Documentation

void dust_dustSequence ( char *  originalSequence  ) 

Definition at line 12 of file dust.c.

References dust_processWindow(), encoding_encodeSequence(), encoding_nucleotide, encoding_numRegularLetters, encoding_randomEncodedLetter(), chunk::end, maskRegion::from, global_malloc(), chunk::score, chunk::start, and maskRegion::to.

Referenced by main().

00013 {
              // Masks repetitive (high triplet-repeat score) stretches of
              // originalSequence in place by overwriting them with 'n'.
              // Steps: copy the input, encode it, replace wildcard codes, pack
              // overlapping triplets into single bytes, score overlapping windows
              // with dust_processWindow(), collect the regions to mask, then write
              // the mask back into originalSequence.
00014         int windowStart;
00015         unsigned char *sequence;
00016     int sequenceLength, windowLength, count;
00017         int cutoffScore = 20;           // a chunk must score above this to be masked
00018         int windowSize = 64;            // maximum number of positions per window
00019         int minimumRegionSize = 4;      // chunks shorter than this are ignored
00020         int linker = 1;                 // max gap allowed when merging with the previous region
00021         int windowhalf = windowSize / 2; // step between successive windows (they overlap by half)
00022         struct chunk chunk;
00023         struct maskRegion *regions = NULL, *currentRegion = NULL;
00024 
00025     sequenceLength = strlen(originalSequence);
00026 
              // Work on a private copy so the original letters stay intact until masking
00027     sequence = (unsigned char*)global_malloc(sequenceLength + 1);
00028         strcpy(sequence, originalSequence);
00029 
00030     // Convert sequence into encoded format
00031         encoding_encodeSequence(sequence, sequenceLength, encoding_nucleotide);
00032 
00033         // Replace wildcards in the sequence
              // NOTE(review): the loop stops at sequenceLength - 2, so the last two
              // letters are never checked, yet the triplet packing below reads them
              // (as count + 1 / count + 2) - confirm this bound is intended.
00034     count = 0;
00035     while (count < sequenceLength - 2)
00036     {
00037         // If this letter's code is outside the regular letters (a wildcard)
00038         if (sequence[count] >= encoding_numRegularLetters)
00039         {
00040             // Substitute the code returned by encoding_randomEncodedLetter()
00041             sequence[count] = encoding_randomEncodedLetter(sequence[count]);
00042         }
00043         count++;
00044         }
00045 
00046     // Convert sequence into encoded triplets
              // Each position is overwritten in place with a byte built from itself
              // and the next two (still unpacked) letters; the final two positions
              // are left as single-letter codes.
00047         count = 0;
00048     while (count < sequenceLength - 2)
00049     {
00050         // Encode triplet
00051         sequence[count] = (sequence[count] << 4) | (sequence[count + 1] << 2) | sequence[count + 2];
00052 
00053         count++;
00054         }
00055 
00056     // Slide a window along the sequence
00057         for (windowStart = 0; windowStart < sequenceLength; windowStart += windowhalf)
00058         {
                      // Clamp the window to the end of the sequence
00059                 windowLength = (int)((sequenceLength > windowStart+windowSize) ? windowSize : sequenceLength - windowStart);
                      // Presumably excludes the two trailing positions that do not
                      // start a complete triplet - TODO confirm
00060                 windowLength -= 2;
00061 
00062 //        printf("process window (length=%d, position=%d)\n", windowLength, windowStart); fflush(stdout);
                      // Fills chunk with the best-scoring span; start/end are relative to windowStart
00063                 dust_processWindow(windowLength, windowStart, &chunk, sequence);
00064 
00065 //        printf("Chunk start=%d end=%d score=%d\n", chunk.start, chunk.end, chunk.score);
00066 //      fflush(stdout);
00067 
00068         // Ignore chunks that are smaller than the minimum
00069                 if ((chunk.end - chunk.start + 1) < minimumRegionSize)
00070                 {
00071                         continue;
00072                 }
00073 
00074                 if (chunk.score > cutoffScore)
00075                 {
00076                 // If this region can be linked to previous (they are close to each other)
                      // "regions" is the list head, i.e. the most recently added region
00077                         if (regions && regions->to + linker >= chunk.start + windowStart &&
00078                             regions->from <= chunk.start + windowStart)
00079                         {
00080                 // Extend previous region
00081                                 regions->to = chunk.end + windowStart;
00082                         }
00083                         else
00084                         {
00085                 // Add new region to start of linked list
00086                                 currentRegion = (struct maskRegion*)global_malloc(sizeof(struct maskRegion));
00087                                 currentRegion->from = chunk.start + windowStart;
00088                                 currentRegion->to = chunk.end + windowStart;
00089                 currentRegion->next = regions;
00090                 regions = currentRegion;
00091                         }
00092                         if (chunk.end < windowhalf)
00093                         {
00094                 // Advance next window to end of chunk
                      // The added value is negative; combined with the loop's
                      // += windowhalf the next window starts at windowStart + chunk.end
00095                                 windowStart += (chunk.end - windowhalf);
00096                         }
00097                 }
00098         }
00099 
00100     free(sequence);
00101 
00102     // For each region
              // NOTE(review): the maskRegion nodes allocated above are not freed
              // anywhere in this function - confirm whether they are reclaimed elsewhere.
00103     currentRegion = regions;
00104     while (currentRegion != NULL)
00105     {
00106         // Mask it by overwriting positions from..to (inclusive) with lowercase 'n'
00107         count = currentRegion->from;
00108         while (count <= currentRegion->to)
00109         {
00110                 originalSequence[count] = 'n';
00111                 count++;
00112         }
00113 
00114 //        printf("Start=%d End=%d\n", currentRegion->from, currentRegion->to);
00115         currentRegion = currentRegion->next;
00116     }
00117 }

Here is the call graph for this function:

Here is the caller graph for this function:

void dust_processChunk ( int  windowLength,
unsigned char *  sequence,
int  chunkStart,
struct chunk *  chunk 
)

Definition at line 144 of file dust.c.

References chunk::end, chunk::score, and chunk::start.

Referenced by dust_processWindow().

00145 {
              // Scans windowLength packed triplets starting at sequence and updates
              // *chunk whenever a prefix of this scan scores higher than chunk->score.
              // chunkStart is stored as chunk->start; chunk->end is stored relative
              // to the start of this scan.
00146     unsigned int sum;
00147         int position, triplet, numOccurrences;
00148         int newScore;
00149         int occurrences[256];
00150 
00151         // Initialize triplet occurrences to zero
              // NOTE(review): only entries 0..63 are zeroed although the array has 256
              // slots; a byte value in 64..254 would read an uninitialized count below
              // (and could divide by zero at position 0) - confirm inputs stay below 64.
00152     triplet = 0;
00153     while (triplet < 64)
00154     {
00155         occurrences[triplet] = 0;
00156         triplet++;
00157     }
00158 
00159     sum = 0;
00160         newScore = 0;
00161 
00162         // For each triplet in the sequence
00163         for (position = 0; position < windowLength; position++)
00164         {
              // Byte value 255 is skipped; presumably a sentinel - nothing in this listing writes it
00165         if (*sequence != 255)
00166         {
00167             // Look up how many times this triplet has occurred so far
00168             numOccurrences = occurrences[*sequence];
00169 
00170             // If it has occurred before
00171             if (numOccurrences)
00172             {
00173                 // Calculate score
                  // sum accumulates the prior-occurrence count of every repeated
                  // triplet; the division is unsigned integer division because sum
                  // is unsigned int
00174                 sum += numOccurrences;
00175                 newScore = 10 * sum / position;
00176 
00177                 // If the best score yet
00178                 if (newScore > chunk->score)
00179                 {
00180                     // Record the start and end of this high-scoring region
                      // + 2 presumably extends the end over the last triplet's remaining two letters
00181                     chunk->score = newScore;
00182                     chunk->start = chunkStart;
00183                     chunk->end = position + 2;
00184                 }
00185             }
00186             occurrences[*sequence]++;
00187                 }
00188         sequence++;
00189 //        printf("[%d]", *occurrencesptr);
00190         }
00191 }

Here is the caller graph for this function:

void dust_processWindow ( int  windowLength,
int  windowStart,
struct chunk *  chunk,
unsigned char *  sequence 
)

Definition at line 120 of file dust.c.

References dust_processChunk(), chunk::end, chunk::score, and chunk::start.

Referenced by dust_dustSequence().

00121 {
              // Finds the best-scoring chunk inside one window of the packed sequence.
              // Resets *chunk, then lets dust_processChunk() try every start offset
              // within the window; on return chunk->start and chunk->end are offsets
              // relative to windowStart.
00122         int chunkStart;
00123 
00124     // Initialize best chunk
00125         chunk->score = 0;
00126         chunk->start = 0;
00127         chunk->end = 0;
00128 
00129     // Get window of the sequence
00130     sequence += windowStart;
00131 
00132         // Perform dust on each chunk in the window
              // Each call scans from offset chunkStart to the end of the window
00133         for (chunkStart = 0; chunkStart < windowLength; chunkStart++)
00134         {
00135 //      printf("wo1 (%d,%d)\n", windowLength-i, i);
00136                 dust_processChunk(windowLength - chunkStart, sequence + chunkStart, chunkStart, chunk);
00137         }
00138 
00139         // Update chunk end
              // dust_processChunk() stores end relative to the chunk's own start;
              // adding start converts it to a window-relative offset
00140         chunk->end += chunk->start;
00141 }

Here is the call graph for this function:

Here is the caller graph for this function:


Generated on Wed Dec 19 20:48:55 2007 for fsa-blast by  doxygen 1.5.2