lib/pslTransMap.c

Go to the documentation of this file.
00001 /* pslTransMap - transitive mapping of an alignment another sequence, via a
00002  * common alignment */
00003 #include "common.h"
00004 #include "pslTransMap.h"
00005 #include "psl.h"
00006 
00007 /*
00008  * Notes:
00009  *  - This code is used with both large and small mapping psls.  Large
00010  *    psls used for doing cross-species mappings and small psl are used for
00011  *    doing protein to mRNA mappings.  This introduces some speed issues.  For
00012  *    chain mapping, a large amount of time is spent in getBlockMapping()
00013  *    searching for the starting point of a mapping.  However an optimization
00014  *    to find the starting point, such as a binKeeper, could be inefficient
00015  *    for a large number of small psls.  Implementing such an optimzation
00016  *    would have to be dependent on the type of mapping.  The code was made
00017  *    16x faster for genome mappings by remembering the current location in
00018  *    the mapping psl between blocks (iMapBlkPtr arg).  This will do for a
00019  *    while.
00020  */
00021 
00022 
00023 struct block
00024 /* coordinates of a block */
00025 {
00026     int qStart;          /* Query start position. */
00027     int qEnd;            /* Query end position. */
00028     int tStart;          /* Target start position. */
00029     int tEnd;            /* Target end position. */
00030 };
00031 
00032 static void pslProtToNA(struct psl *psl)
00033 /* convert a protein/NA alignment to a NA/NA alignment */
00034 {
00035 int iBlk;
00036 
00037 psl->qStart *= 3;
00038 psl->qEnd *= 3;
00039 psl->qSize *= 3;
00040 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
00041     {
00042     psl->blockSizes[iBlk] *= 3;
00043     psl->qStarts[iBlk] *= 3;
00044     }
00045 }
00046 
00047 static void pslNAToProt(struct psl *psl)
00048 /* undo pslProtToNA */
00049 {
00050 int iBlk;
00051 
00052 psl->qStart /= 3;
00053 psl->qEnd /= 3;
00054 psl->qSize /= 3;
00055 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
00056     {
00057     psl->blockSizes[iBlk] /= 3;
00058     psl->qStarts[iBlk] /= 3;
00059     }
00060 }
00061 
00062 static struct psl* createMappedPsl(struct psl* inPsl, struct psl *mapPsl,
00063                                    int mappedPslMax)
00064 /* setup a PSL for the output alignment */
00065 {
00066 char strand[3];
00067 assert(pslTStrand(inPsl) == pslQStrand(mapPsl));
00068 
00069 /* strand can be taken from both alignments, since common sequence is in same
00070  * orientation. */
00071 strand[0] = pslQStrand(inPsl);
00072 strand[1] = pslTStrand(mapPsl);
00073 strand[2] = '\n';
00074 
00075 return pslNew(inPsl->qName, inPsl->qSize, 0, 0,
00076               mapPsl->tName, mapPsl->tSize, 0, 0,
00077               strand, mappedPslMax, 0);
00078 }
00079 
00080 static struct block blockFromPslBlock(struct psl* psl, int iBlock)
00081 /* fill in a block object from a psl block */
00082 {
00083 struct block block;
00084 block.qStart = psl->qStarts[iBlock];
00085 block.qEnd = psl->qStarts[iBlock] + psl->blockSizes[iBlock];
00086 block.tStart = psl->tStarts[iBlock];
00087 block.tEnd = psl->tStarts[iBlock] + psl->blockSizes[iBlock];
00088 return block;
00089 }
00090 
00091 static void addPslBlock(struct psl* psl, struct block* blk, int* pslMax)
00092 /* add a block to a psl */
00093 {
00094 unsigned newIBlk = psl->blockCount;
00095 
00096 assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
00097 if (newIBlk >= *pslMax)
00098     pslGrow(psl, pslMax);
00099 psl->qStarts[newIBlk] = blk->qStart;
00100 psl->tStarts[newIBlk] = blk->tStart;
00101 psl->blockSizes[newIBlk] = blk->qEnd - blk->qStart;
00102 /* lie about match counts. */
00103 psl->match += psl->blockSizes[newIBlk];
00104 psl->blockCount++;
00105 }
00106 
00107 static void setPslBounds(struct psl* mappedPsl)
00108 /* set sequences bounds on mapped PSL */
00109 {
00110 int lastBlk = mappedPsl->blockCount-1;
00111 
00112 /* set start/end of sequences */
00113 mappedPsl->qStart = mappedPsl->qStarts[0];
00114 mappedPsl->qEnd = mappedPsl->qStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
00115 if (pslQStrand(mappedPsl) == '-')
00116     reverseIntRange(&mappedPsl->qStart, &mappedPsl->qEnd, mappedPsl->qSize);
00117 
00118 mappedPsl->tStart = mappedPsl->tStarts[0];
00119 mappedPsl->tEnd = mappedPsl->tStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
00120 if (pslTStrand(mappedPsl) == '-')
00121     reverseIntRange(&mappedPsl->tStart, &mappedPsl->tEnd, mappedPsl->tSize);
00122 }
00123 
00124 static void adjustOrientation(unsigned opts, struct psl *inPsl, char *inPslOrigStrand,
00125                               struct psl* mappedPsl)
00126 /* adjust strand, possibly reverse complementing, based on
00127  * pslTransMapKeepTrans option and input psls. */
00128 {
00129 if (!(opts&pslTransMapKeepTrans) || ((inPslOrigStrand[1] == '\0') && (mappedPsl->strand[1] == '\0')))
00130     {
00131     /* make untranslated */
00132     if (pslTStrand(mappedPsl) == '-')
00133         pslRc(mappedPsl);
00134     mappedPsl->strand[1] = '\0';  /* implied target strand */
00135     }
00136 }
00137 
00138 static struct block getBeforeBlockMapping(unsigned mqStart, unsigned mqEnd,
00139                                           struct block* align1Blk)
00140 /* map part of an ungapped psl block that occurs before a mapPsl block */
00141 {
00142 struct block mappedBlk;
00143 
00144 /* mRNA start in genomic gap before this block, this will
00145  * be an inPsl insert */
00146 unsigned size = (align1Blk->tEnd < mqStart)
00147     ? (align1Blk->tEnd - align1Blk->tStart)
00148     : (mqStart - align1Blk->tStart);
00149 ZeroVar(&mappedBlk);
00150 mappedBlk.qStart = align1Blk->qStart;
00151 mappedBlk.qEnd = align1Blk->qStart + size;
00152 return mappedBlk;
00153 }
00154 
00155 static struct block getOverBlockMapping(unsigned mqStart, unsigned mqEnd,
00156                                         unsigned mtStart, struct block* align1Blk)
00157 /* map part of an ungapped psl block that overlapps a mapPsl block. */
00158 {
00159 struct block mappedBlk;
00160 
00161 /* common sequence start contained in this block, this handles aligned
00162  * and genomic inserts */
00163 unsigned off = align1Blk->tStart - mqStart;
00164 unsigned size = (align1Blk->tEnd > mqEnd)
00165     ? (mqEnd - align1Blk->tStart)
00166     : (align1Blk->tEnd - align1Blk->tStart);
00167 ZeroVar(&mappedBlk);
00168 mappedBlk.qStart = align1Blk->qStart;
00169 mappedBlk.qEnd = align1Blk->qStart + size;
00170 mappedBlk.tStart = mtStart + off;
00171 mappedBlk.tEnd = mtStart + off + size;
00172 return mappedBlk;
00173 }
00174 
00175 static struct block getBlockMapping(struct psl* inPsl, struct psl *mapPsl,
00176                                     int *iMapBlkPtr, struct block* align1Blk)
00177 /* Map part or all of a ungapped psl block to the genome.  This returns the
00178  * coordinates of the sub-block starting at the beginning of the align1Blk.
00179  * If this is a gap, either the target or query coords are zero.  This works
00180  * in genomic strand space.  The search starts at the specified map block,
00181  * which is updated to prevent rescanning large psls.
00182  */
00183 {
00184 int iBlk;
00185 struct block mappedBlk;
00186 
00187 /* search for block or gap containing start of mrna block */
00188 for (iBlk = *iMapBlkPtr; iBlk < mapPsl->blockCount; iBlk++)
00189     {
00190     unsigned mqStart = mapPsl->qStarts[iBlk];
00191     unsigned mqEnd = mapPsl->qStarts[iBlk]+mapPsl->blockSizes[iBlk];
00192     if (align1Blk->tStart < mqStart)
00193         {
00194         *iMapBlkPtr = iBlk;
00195         return getBeforeBlockMapping(mqStart, mqEnd, align1Blk);
00196         }
00197     if ((align1Blk->tStart >= mqStart) && (align1Blk->tStart < mqEnd))
00198         {
00199         *iMapBlkPtr = iBlk;
00200         return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk);
00201         }
00202     }
00203 
00204 /* reached the end of the mRNA->genome alignment, finish off the 
00205  * rest of the the protein as an insert */
00206 ZeroVar(&mappedBlk);
00207 mappedBlk.qStart = align1Blk->qStart;
00208 mappedBlk.qEnd = align1Blk->qEnd;
00209 *iMapBlkPtr = iBlk;
00210 return mappedBlk;
00211 }
00212 
00213 static boolean mapBlock(struct psl *inPsl, struct psl *mapPsl, int *iMapBlkPtr,
00214                         struct block *align1Blk, struct psl* mappedPsl,
00215                         int* mappedPslMax)
00216 /* Add a PSL block from a ungapped portion of an alignment, mapping it to the
00217  * genome.  If the started of the inPsl block maps to a part of the mapPsl
00218  * that is aligned, it is added as a PSL block, otherwise the gap is skipped.
00219  * Block starts are adjusted for next call.  Return FALSE if the end of the
00220  * alignment is reached. */
00221 {
00222 struct block mappedBlk;
00223 unsigned size;
00224 assert(align1Blk->qStart <= align1Blk->qEnd);
00225 assert(align1Blk->tStart <= align1Blk->tEnd);
00226 assert((align1Blk->qEnd - align1Blk->qStart) == (align1Blk->tEnd - align1Blk->tStart));
00227 
00228 if ((align1Blk->qStart >= align1Blk->qEnd) || (align1Blk->tStart >= align1Blk->tEnd))
00229     return FALSE;  /* end of ungapped block. */
00230 
00231 /* find block or gap with start coordinates of mrna */
00232 mappedBlk = getBlockMapping(inPsl, mapPsl, iMapBlkPtr, align1Blk);
00233 
00234 if ((mappedBlk.qEnd != 0) && (mappedBlk.tEnd != 0))
00235     addPslBlock(mappedPsl, &mappedBlk, mappedPslMax);
00236 
00237 /* advance past block or gap */
00238 size = (mappedBlk.qEnd != 0)
00239     ? (mappedBlk.qEnd - mappedBlk.qStart)
00240     : (mappedBlk.tEnd - mappedBlk.tStart);
00241 align1Blk->qStart += size;
00242 align1Blk->tStart += size;
00243 
00244 return TRUE;
00245 }
00246 
00247 struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl)
00248 /* map a psl via a mapping psl, a single psl is returned, or NULL if it
00249  * couldn't be mapped. */
00250 {
00251 int mappedPslMax = 8; /* allocated space in output psl */
00252 int iMapBlk = 0;
00253 char inPslOrigStrand[3];
00254 boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
00255 boolean cnv1 = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl));
00256 boolean cnv2 = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl));
00257 int iBlock;
00258 struct psl* mappedPsl;
00259 
00260 if (!sameString(inPsl->tName, mapPsl->qName))
00261     errAbort("Error: inPsl tName %s != mapPsl tName %s)",
00262             inPsl->tName, mapPsl->qName);
00263 if (inPsl->tSize != mapPsl->qSize)
00264     errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
00265             inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);
00266 
00267 /* convert protein PSLs */
00268 if (cnv1)
00269     pslProtToNA(inPsl);
00270 if (cnv2)
00271     pslProtToNA(mapPsl);
00272 
00273 /* need to ensure common sequence is in same orientation, save strand for later */
00274 safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
00275 if (rcInPsl)
00276     pslRc(inPsl);
00277 
00278 mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
00279 
00280 /* Fill in ungapped blocks.  */
00281 for (iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
00282     {
00283     struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
00284     while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
00285                     &mappedPslMax))
00286         continue;
00287     }
00288 
00289 /* finish up psl, or free if no blocks were added */
00290 assert(mappedPsl->blockCount <= mappedPslMax);
00291 if (mappedPsl->blockCount == 0)
00292     pslFree(&mappedPsl);  /* nothing made it */
00293 else
00294     {
00295     setPslBounds(mappedPsl);
00296     adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl);
00297     }
00298 
00299 /* restore input */
00300 if (rcInPsl)
00301     {
00302     pslRc(inPsl);
00303     strcpy(inPsl->strand, inPslOrigStrand);
00304     }
00305 if (cnv1)
00306     pslNAToProt(inPsl);
00307 if (cnv2)
00308     pslNAToProt(mapPsl);
00309 
00310 return mappedPsl;
00311 }
00312 

Generated on Tue Dec 25 18:39:31 2007 for blat by  doxygen 1.5.2