lib/psl.c

Go to the documentation of this file.
00001 /* psl.c was originally generated by the autoSql program, which also 
00002  * generated as_psl.h and as_psl.sql.  This module links the database and the RAM 
00003  * representation of objects. 
00004  *
00005  * This file is copyright 2002 Jim Kent, but license is hereby
00006  * granted for all use - public, private or commercial. */
00007 
00008 #include "common.h"
00009 #include "sqlNum.h"
00010 #include "sqlList.h"
00011 #include "localmem.h"
00012 #include "psl.h"
00013 #include "hash.h"
00014 #include "linefile.h"
00015 #include "dnaseq.h"
00016 #include "dystring.h"
00017 #include "fuzzyFind.h"
00018 #include "aliType.h"
00019 #include "binRange.h"
00020 #include "rangeTree.h"
00021 
00022 static char const rcsid[] = "$Id: psl.c,v 1.76 2007/03/04 20:24:19 kent Exp $";
00023 
00024 static char *createString = 
00025 "CREATE TABLE %s (\n"
00026     "%s"                                /* Optional bin */
00027     "matches int unsigned not null,     # Number of bases that match that aren't repeats\n"
00028     "misMatches int unsigned not null,  # Number of bases that don't match\n"
00029     "repMatches int unsigned not null,  # Number of bases that match but are part of repeats\n"
00030     "nCount int unsigned not null,      # Number of 'N' bases\n"
00031     "qNumInsert int unsigned not null,  # Number of inserts in query\n"
00032     "qBaseInsert int unsigned not null, # Number of bases inserted in query\n"
00033     "tNumInsert int unsigned not null,  # Number of inserts in target\n"
00034     "tBaseInsert int unsigned not null, # Number of bases inserted in target\n"
00035     "strand char(2) not null,   # + or - for strand.  First character is query, second is target.\n"
00036     "qName varchar(255) not null,       # Query sequence name\n"
00037     "qSize int unsigned not null,       # Query sequence size\n"
00038     "qStart int unsigned not null,      # Alignment start position in query\n"
00039     "qEnd int unsigned not null,        # Alignment end position in query\n"
00040     "tName varchar(255) not null,       # Target sequence name\n"
00041     "tSize int unsigned not null,       # Target sequence size\n"
00042     "tStart int unsigned not null,      # Alignment start position in target\n"
00043     "tEnd int unsigned not null,        # Alignment end position in target\n"
00044     "blockCount int unsigned not null,  # Number of blocks in alignment\n"
00045     "blockSizes longblob not null,      # Size of each block\n"
00046     "qStarts longblob not null, # Start of each block in query.\n"
00047     "tStarts longblob not null, # Start of each block in target.\n";
00048 
00049 static char *indexString = 
00050           "#Indices\n"
00051     "%s"                            /* Optional bin. */
00052     "INDEX(qName(12))\n"
00053 ")\n";
00054 
00055 
00056 struct psl *pslxLoad(char **row)
00057 /* Load a psl from row fetched with select * from psl
00058  * from database.  Dispose of this with pslFree(). */
00059 {
00060 struct psl *ret = pslLoad(row);
00061 int retSize;
00062 sqlStringDynamicArray(row[21],&ret->qSequence, &retSize);
00063 sqlStringDynamicArray(row[22],&ret->tSequence, &retSize);
00064 return ret;
00065 }
00066 
00067 struct psl *pslLoad(char **row)
00068 /* Load a psl from row fetched with select * from psl
00069  * from database.  Dispose of this with pslFree(). */
00070 {
00071 struct psl *ret;
00072 int sizeOne;
00073 
00074 AllocVar(ret);
00075 ret->blockCount = sqlUnsigned(row[17]);
00076 ret->match = sqlUnsigned(row[0]);
00077 ret->misMatch = sqlUnsigned(row[1]);
00078 ret->repMatch = sqlUnsigned(row[2]);
00079 ret->nCount = sqlUnsigned(row[3]);
00080 ret->qNumInsert = sqlUnsigned(row[4]);
00081 ret->qBaseInsert = sqlSigned(row[5]);
00082 ret->tNumInsert = sqlUnsigned(row[6]);
00083 ret->tBaseInsert = sqlSigned(row[7]);
00084 strcpy(ret->strand, row[8]);
00085 ret->qName = cloneString(row[9]);
00086 ret->qSize = sqlUnsigned(row[10]);
00087 ret->qStart = sqlUnsigned(row[11]);
00088 ret->qEnd = sqlUnsigned(row[12]);
00089 ret->tName = cloneString(row[13]);
00090 ret->tSize = sqlUnsigned(row[14]);
00091 ret->tStart = sqlUnsigned(row[15]);
00092 ret->tEnd = sqlUnsigned(row[16]);
00093 sqlUnsignedDynamicArray(row[18], &ret->blockSizes, &sizeOne);
00094 if (sizeOne != ret->blockCount)
00095     {
00096     printf("sizeOne bloxksizes %d bs %d block=%s\n",sizeOne, ret->blockCount,row[18]);
00097     }
00098 assert(sizeOne == ret->blockCount);
00099 sqlUnsignedDynamicArray(row[19], &ret->qStarts, &sizeOne);
00100 if (sizeOne != ret->blockCount)
00101     {
00102     printf("sizeOne qStarts %d bs %d\n",sizeOne, ret->blockCount);
00103     }
00104 assert(sizeOne == ret->blockCount);
00105 sqlUnsignedDynamicArray(row[20], &ret->tStarts, &sizeOne);
00106 if (sizeOne != ret->blockCount)
00107     {
00108     printf("sizeOne tStarts %d bs %d\n",sizeOne, ret->blockCount);
00109     }
00110 assert(sizeOne == ret->blockCount);
00111 return ret;
00112 }
00113 
00114 struct psl *pslCommaIn(char **pS, struct psl *ret)
00115 /* Create a psl out of a comma separated string. 
00116  * This will fill in ret if non-null, otherwise will
00117  * return a new psl */
00118 {
00119 char *s = *pS;
00120 int i;
00121 
00122 if (ret == NULL)
00123     AllocVar(ret);
00124 ret->match = sqlUnsignedComma(&s);
00125 ret->misMatch = sqlUnsignedComma(&s);
00126 ret->repMatch = sqlUnsignedComma(&s);
00127 ret->nCount = sqlUnsignedComma(&s);
00128 ret->qNumInsert = sqlUnsignedComma(&s);
00129 ret->qBaseInsert = sqlSignedComma(&s);
00130 ret->tNumInsert = sqlUnsignedComma(&s);
00131 ret->tBaseInsert = sqlSignedComma(&s);
00132 sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
00133 ret->qName = sqlStringComma(&s);
00134 ret->qSize = sqlUnsignedComma(&s);
00135 ret->qStart = sqlUnsignedComma(&s);
00136 ret->qEnd = sqlUnsignedComma(&s);
00137 ret->tName = sqlStringComma(&s);
00138 ret->tSize = sqlUnsignedComma(&s);
00139 ret->tStart = sqlUnsignedComma(&s);
00140 ret->tEnd = sqlUnsignedComma(&s);
00141 ret->blockCount = sqlUnsignedComma(&s);
00142 s = sqlEatChar(s, '{');
00143 AllocArray(ret->blockSizes, ret->blockCount);
00144 for (i=0; i<ret->blockCount; ++i)
00145     {
00146     ret->blockSizes[i] = sqlUnsignedComma(&s);
00147     }
00148 s = sqlEatChar(s, '}');
00149 s = sqlEatChar(s, ',');
00150 s = sqlEatChar(s, '{');
00151 AllocArray(ret->qStarts, ret->blockCount);
00152 for (i=0; i<ret->blockCount; ++i)
00153     {
00154     ret->qStarts[i] = sqlUnsignedComma(&s);
00155     }
00156 s = sqlEatChar(s, '}');
00157 s = sqlEatChar(s, ',');
00158 s = sqlEatChar(s, '{');
00159 AllocArray(ret->tStarts, ret->blockCount);
00160 for (i=0; i<ret->blockCount; ++i)
00161     {
00162     ret->tStarts[i] = sqlUnsignedComma(&s);
00163     }
00164 s = sqlEatChar(s, '}');
00165 s = sqlEatChar(s, ',');
00166 *pS = s;
00167 return ret;
00168 }
00169 
00170 void pslFree(struct psl **pEl)
00171 /* Free a single dynamically allocated psl such as created
00172  * with pslLoad(). */
00173 {
00174 struct psl *el;
00175 
00176 if ((el = *pEl) == NULL) return;
00177 freeMem(el->qName);
00178 freeMem(el->tName);
00179 freeMem(el->blockSizes);
00180 freeMem(el->qStarts);
00181 freeMem(el->tStarts);
00182 if (el->qSequence)
00183     {
00184     freeMem(el->qSequence[0]);
00185     freeMem(el->qSequence);
00186     }
00187 if (el->tSequence)
00188     {
00189     freeMem(el->tSequence[0]);
00190     freeMem(el->tSequence);
00191     }
00192 freez(pEl);
00193 }
00194 
00195 void pslFreeList(struct psl **pList)
00196 /* Free a list of dynamically allocated psl's */
00197 {
00198 struct psl *el, *next;
00199 
00200 for (el = *pList; el != NULL; el = next)
00201     {
00202     next = el->next;
00203     pslFree(&el);
00204     }
00205 *pList = NULL;
00206 }
00207 
00208 void pslOutput(struct psl *el, FILE *f, char sep, char lastSep) 
00209 /* Print out psl.  Separate fields with sep. Follow last field with lastSep. */
00210 {
00211 int i;
00212 fprintf(f, "%u", el->match);
00213 fputc(sep,f);
00214 fprintf(f, "%u", el->misMatch);
00215 fputc(sep,f);
00216 fprintf(f, "%u", el->repMatch);
00217 fputc(sep,f);
00218 fprintf(f, "%u", el->nCount);
00219 fputc(sep,f);
00220 fprintf(f, "%u", el->qNumInsert);
00221 fputc(sep,f);
00222 fprintf(f, "%d", el->qBaseInsert);
00223 fputc(sep,f);
00224 fprintf(f, "%u", el->tNumInsert);
00225 fputc(sep,f);
00226 fprintf(f, "%d", el->tBaseInsert);
00227 fputc(sep,f);
00228 if (sep == ',') fputc('"',f);
00229 fprintf(f, "%s", el->strand);
00230 if (sep == ',') fputc('"',f);
00231 fputc(sep,f);
00232 if (sep == ',') fputc('"',f);
00233 fprintf(f, "%s", el->qName);
00234 if (sep == ',') fputc('"',f);
00235 fputc(sep,f);
00236 fprintf(f, "%u", el->qSize);
00237 fputc(sep,f);
00238 fprintf(f, "%u", el->qStart);
00239 fputc(sep,f);
00240 fprintf(f, "%u", el->qEnd);
00241 fputc(sep,f);
00242 if (sep == ',') fputc('"',f);
00243 fprintf(f, "%s", el->tName);
00244 if (sep == ',') fputc('"',f);
00245 fputc(sep,f);
00246 fprintf(f, "%u", el->tSize);
00247 fputc(sep,f);
00248 fprintf(f, "%u", el->tStart);
00249 fputc(sep,f);
00250 fprintf(f, "%u", el->tEnd);
00251 fputc(sep,f);
00252 fprintf(f, "%u", el->blockCount);
00253 fputc(sep,f);
00254 if (sep == ',') fputc('{',f);
00255 for (i=0; i<el->blockCount; ++i)
00256     {
00257     fprintf(f, "%u", el->blockSizes[i]);
00258     fputc(',', f);
00259     }
00260 if (sep == ',') fputc('}',f);
00261 fputc(sep,f);
00262 if (sep == ',') fputc('{',f);
00263 for (i=0; i<el->blockCount; ++i)
00264     {
00265     fprintf(f, "%u", el->qStarts[i]);
00266     fputc(',', f);
00267     }
00268 if (sep == ',') fputc('}',f);
00269 fputc(sep,f);
00270 if (sep == ',') fputc('{',f);
00271 for (i=0; i<el->blockCount; ++i)
00272     {
00273     fprintf(f, "%u", el->tStarts[i]);
00274     fputc(',', f);
00275     }
00276 if (sep == ',') fputc('}',f);
00277 if (el->qSequence)
00278     {
00279     fputc(sep,f);
00280     if (sep == ',') fputc('{',f);
00281     for (i=0; i<el->blockCount; ++i)
00282         {
00283         fprintf(f, "%s", el->qSequence[i]);
00284         fputc(',', f);
00285         }
00286     if (sep == ',') fputc('}',f);
00287     fputc(sep,f);
00288     if (sep == ',') fputc('{',f);
00289     for (i=0; i<el->blockCount; ++i)
00290         {
00291         fprintf(f, "%s", el->tSequence[i]);
00292         fputc(',', f);
00293         }
00294     if (sep == ',') fputc('}',f);
00295     }
00296 
00297 fputc(lastSep,f);
00298 if (ferror(f))
00299     {
00300     perror("Error writing psl file\n");
00301     errAbort("\n");
00302     }
00303 }
00304 
00305 /* ----- end autoSql generated part --------------- */
00306 
00307 void pslOutputShort(struct psl *el, FILE *f, char sep, char lastSep) 
00308 /* Print out psl.  Separate fields with sep. Follow last field with lastSep. */
00309 {
00310 fprintf(f, "%u", el->match);
00311 fputc(sep,f);
00312 fprintf(f, "%u", el->misMatch);
00313 fputc(sep,f);
00314 fprintf(f, "%u", el->repMatch);
00315 fputc(sep,f);
00316 fprintf(f, "%u", el->qNumInsert);
00317 fputc(sep,f);
00318 fprintf(f, "%d", el->qBaseInsert);
00319 fputc(sep,f);
00320 fprintf(f, "%u", el->tNumInsert);
00321 fputc(sep,f);
00322 fprintf(f, "%d", el->tBaseInsert);
00323 fputc(sep,f);
00324 if (sep == ',') fputc('"',f);
00325 fprintf(f, "%s", el->strand);
00326 if (sep == ',') fputc('"',f);
00327 fputc(sep,f);
00328 if (sep == ',') fputc('"',f);
00329 fprintf(f, "%s", el->qName);
00330 if (sep == ',') fputc('"',f);
00331 fputc(sep,f);
00332 fprintf(f, "%u", el->qStart);
00333 fputc(sep,f);
00334 fprintf(f, "%u", abs(el->qEnd - el->qStart));
00335 fputc(sep,f);
00336 if (sep == ',') fputc('"',f);
00337 fprintf(f, "%s", el->tName);
00338 if (sep == ',') fputc('"',f);
00339 fputc(sep,f);
00340 fprintf(f, "%u", el->tStart);
00341 fputc(sep,f);
00342 fprintf(f, "%u", abs(el->tEnd - el->tStart));
00343 fputc(sep,f);
00344 fprintf(f, "%u", el->blockCount);
00345 fputc(sep,f);
00346 if (sep == ',') fputc('{',f);
00347 fputc(lastSep,f);
00348 if (ferror(f))
00349     {
00350     perror("Error writing psl file\n");
00351     errAbort("\n");
00352     }
00353 }
00354 
00355 void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep) 
00356 /* Print out selected psl values.  Separate fields with sep. Follow last field with lastSep. */
00357 /* Prints out a better format with bold field headings followed by value */
00358 /* Requires further upstream work to ensure that only the field headers */
00359 /* declared here are printed if replacing an existing psl print function*/
00360 {
00361 const char *headers[] = {"Matches", "Mismatches", "Matches in repeats", "Number of N bases", "Query name", "Size", "Start", "End", "Chromosome", "Strand", "Start", "End"};
00362 char *hformat = "<B>%s:</B> "; /* string for formatted print for headers */
00363 char *uformat = "<B>%s:</B> %u%c"; /* string for formatted print for unsigned variable */
00364 char *targName;
00365 
00366 fprintf(f, uformat, headers[0], el->match, sep);
00367 fprintf(f, uformat, headers[1], el->misMatch, sep);
00368 fprintf(f, uformat, headers[2], el->repMatch, sep);
00369 fprintf(f, uformat, headers[3], el->nCount, sep);
00370 
00371 fprintf(f, hformat, headers[4]);
00372 if (sep == ',') fputc('"',f);
00373 fprintf(f, "%s", el->qName);
00374 if (sep == ',') fputc('"',f);
00375 fputc(sep,f);
00376 
00377 fprintf(f, uformat, headers[5], el->qSize, sep);
00378 fprintf(f, uformat, headers[6], el->qStart, sep);
00379 fprintf(f, uformat, headers[7], el->qEnd, sep);
00380 
00381 fprintf(f, hformat, headers[8]);
00382 if (sep == ',') fputc('"',f);
00383 /* skip leading 'chr' in string to get only chromosome part */
00384 targName = el->tName;
00385 if (startsWith("chr", el->tName))
00386    targName += 3;
00387 fprintf(f, "%s", targName);
00388 
00389 if (sep == ',') fputc('"',f);
00390 fputc(sep,f);
00391 
00392 fprintf(f, hformat, headers[9]);
00393 if (sep == ',') fputc('"',f);
00394 fprintf(f, "%s", el->strand);
00395 if (sep == ',') fputc('"',f);
00396 fputc(sep,f);
00397 
00398 fprintf(f, uformat, headers[10], el->tStart, sep);
00399 fprintf(f, uformat, headers[11], el->tEnd, sep);
00400 
00401 fputc(lastSep,f);
00402 
00403 if (ferror(f))
00404     {
00405     perror("Error writing psl file\n");
00406     errAbort("\n");
00407     }
00408 
00409 }
00410 
00411 struct psl *pslLoadAll(char *fileName)
00412 /* Load all psl's in file. */
00413 {
00414 struct lineFile *lf = pslFileOpen(fileName);
00415 struct psl *pslList = NULL, *psl;
00416 while ((psl = pslNext(lf)) != NULL)
00417     {
00418     slAddHead(&pslList, psl);
00419     }
00420 slReverse(&pslList);
00421 lineFileClose(&lf);
00422 return pslList;
00423 }
00424 
00425 
00426 int pslCmpQuery(const void *va, const void *vb)
00427 /* Compare to sort based on query start. */
00428 {
00429 const struct psl *a = *((struct psl **)va);
00430 const struct psl *b = *((struct psl **)vb);
00431 int dif;
00432 dif = strcmp(a->qName, b->qName);
00433 if (dif == 0)
00434     dif = a->qStart - b->qStart;
00435 return dif;
00436 }
00437 
00438 int pslCmpTarget(const void *va, const void *vb)
00439 /* Compare to sort based on target start. */
00440 {
00441 const struct psl *a = *((struct psl **)va);
00442 const struct psl *b = *((struct psl **)vb);
00443 int dif;
00444 dif = strcmp(a->tName, b->tName);
00445 if (dif == 0)
00446     dif = a->tStart - b->tStart;
00447 return dif;
00448 }
00449 
00450 int pslCmpTargetAndStrand(const void *va, const void *vb)
00451 /* Compare to sort based on target, strand,  tStart. */
00452 {
00453 const struct psl *a = *((struct psl **)va);
00454 const struct psl *b = *((struct psl **)vb);
00455 int dif;
00456 dif = strcmp(a->tName, b->tName);
00457 if (dif == 0)
00458     dif = strcmp(a->strand, b->strand);
00459 if (dif == 0)
00460     dif = a->tStart - b->tStart;
00461 return dif;
00462 }
00463 
00464 
00465 int pslCmpScore(const void *va, const void *vb)
00466 /* Compare to sort based on score (descending). */
00467 {
00468 const struct psl *a = *((struct psl **)va);
00469 const struct psl *b = *((struct psl **)vb);
00470 return pslScore(b) - pslScore(a);
00471 }
00472 
00473 int pslCmpQueryScore(const void *va, const void *vb)
00474 /* Compare to sort based on query then score (descending). */
00475 {
00476 const struct psl *a = *((struct psl **)va);
00477 const struct psl *b = *((struct psl **)vb);
00478 int diff = strcmp(a->qName, b->qName);
00479 if (diff == 0)
00480     diff = pslScore(b) - pslScore(a);
00481 return diff;
00482 }
00483 
00484 int pslCmpMatch(const void *va, const void *vb)
00485 /* Compare to sort based on match */
00486 {
00487 const struct psl *a = *((struct psl **)va);
00488 const struct psl *b = *((struct psl **)vb);
00489 return b->match - a->match;
00490 }
00491 
00492 static void pslLabelColumns(FILE *f)
00493 /* Write column info. */
00494 {
00495 fputs("\n"
00496 "match\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ        \tQ   \tQ    \tQ  \tT        \tT   \tT    \tT  \tblock\tblockSizes \tqStarts\t tStarts\n"
00497 "     \tmatch\tmatch\t   \tcount\tbases\tcount\tbases\t      \tname     \tsize\tstart\tend\tname     \tsize\tstart\tend\tcount\n" 
00498 "---------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
00499 f);
00500 }
00501 
00502 void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType)
00503 /* Write header for extended (possibly protein) psl file. */
00504 {
00505 fprintf(f, "psLayout version 4 %s %s\n", gfTypeName(qType), gfTypeName(tType));
00506 pslLabelColumns(f);
00507 }
00508 
00509 void pslWriteHead(FILE *f)
00510 /* Write head of psl. */
00511 {
00512 fputs("psLayout version 3\n", f);
00513 pslLabelColumns(f);
00514 }
00515 
00516 void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader)
00517 /* Write a psl file from list. */
00518 {
00519 FILE *f;
00520 struct psl *psl;
00521 
00522 f = mustOpen(fileName, "w");
00523 if (writeHeader)
00524     pslWriteHead(f);
00525 for (psl = pslList; psl != NULL; psl = psl->next)
00526     pslTabOut(psl, f);
00527 fclose(f);
00528 }
00529 
00530 void pslxFileOpen(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf)
00531 /* Read header part of psl and make sure it's right.  Return
00532  * sequence types and file handle. */
00533 {
00534 char *line;
00535 int lineSize;
00536 char *words[30];
00537 char *version;
00538 int wordCount;
00539 int i;
00540 enum gfType qt = gftRna,  tt = gftDna;
00541 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00542 
00543 if (!lineFileNext(lf, &line, &lineSize))
00544     warn("%s is empty", fileName);
00545 else
00546     {
00547     if (startsWith("psLayout version", line))
00548         {
00549         wordCount = chopLine(line, words);
00550         if (wordCount < 3)
00551             errAbort("%s is not a psLayout file", fileName);
00552         version = words[2];
00553         if (sameString(version, "3"))
00554             {
00555             }
00556         else if (sameString(version, "4"))
00557             {
00558             qt = gfTypeFromName(words[3]);
00559             tt = gfTypeFromName(words[4]);
00560             }
00561         else
00562             {
00563             errAbort("%s is version %s of psLayout, this program can only handle through version 4",
00564                 fileName,  version);
00565             }
00566         for (i=0; i<4; ++i)
00567             {
00568             if (!lineFileNext(lf, &line, &lineSize))
00569                 errAbort("%s severely truncated", fileName);
00570             }
00571         }
00572     else
00573         {
00574         char *s = cloneString(line);
00575         while (line != NULL && line[0] == '#')
00576             {
00577             freeMem(s);
00578             lineFileNext(lf, &line, &lineSize);
00579             s = cloneString(line);
00580             }
00581         wordCount = chopLine(s, words);
00582         if (wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-'))
00583             errAbort("%s is not a psLayout file", fileName);
00584         else
00585             lineFileReuse(lf); 
00586         freeMem(s);
00587         }
00588     }
00589 *retQueryType = qt;
00590 *retTargetType = tt;
00591 *retLf = lf;
00592 }
00593 
00594 static void pslxFileOpenWithMetaConfig(char *fileName, bool isMetaUnique, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00595 /* Read header part of psl and make sure it's right.  Return
00596  * sequence types and file handle and send meta data to output file f */
00597 {
00598 char *line;
00599 int lineSize;
00600 char *words[30];
00601 char *version;
00602 int wordCount;
00603 int i;
00604 enum gfType qt = gftRna,  tt = gftDna;
00605 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00606 
00607 lineFileSetMetaDataOutput(lf, f);
00608 if (isMetaUnique)
00609     lineFileSetUniqueMetaData(lf);
00610 if (!lineFileNext(lf, &line, &lineSize))
00611     warn("%s is empty", fileName);
00612 else
00613     {
00614     if (startsWith("psLayout version", line))
00615         {
00616         wordCount = chopLine(line, words);
00617         if (wordCount < 3)
00618             errAbort("%s is not a psLayout file", fileName);
00619         version = words[2];
00620         if (sameString(version, "3"))
00621             {
00622             }
00623         else if (sameString(version, "4"))
00624             {
00625             qt = gfTypeFromName(words[3]);
00626             tt = gfTypeFromName(words[4]);
00627             }
00628         else
00629             {
00630             errAbort("%s is version %s of psLayout, this program can only handle through version 4",
00631                 fileName,  version);
00632             }
00633         for (i=0; i<4; ++i)
00634             {
00635             if (!lineFileNext(lf, &line, &lineSize))
00636                 errAbort("%s severely truncated", fileName);
00637             }
00638         }
00639     else
00640         {
00641         char *s = cloneString(line);
00642         boolean eof = FALSE;
00643         while ((line[0] == '#') && (!eof))
00644             {
00645             freeMem(s);
00646             if (!lineFileNext(lf, &line, &lineSize))
00647                 eof = TRUE;
00648             s = cloneString(line);
00649             }
00650         wordCount = chopLine(s, words);
00651         if ((wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-')) && (!eof))
00652             errAbort("%s is not a psLayout file", fileName);
00653         else
00654             lineFileReuse(lf); 
00655         freeMem(s);
00656         }
00657     }
00658 *retQueryType = qt;
00659 *retTargetType = tt;
00660 *retLf = lf;
00661 }
00662 
00663 void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00664 /* Read header part of psl and make sure it's right.  Return
00665  * sequence types and file handle and send meta data to output file f */
00666 {
00667 pslxFileOpenWithMetaConfig(fileName, FALSE, retQueryType, retTargetType, retLf, f);
00668 }
00669 
00670 void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00671 /* Read header part of psl and make sure it's right.  Return
00672  * sequence types and file handle and send only unique meta data to output f */
00673 {
00674 pslxFileOpenWithMetaConfig(fileName, TRUE, retQueryType, retTargetType, retLf, f);
00675 }
00676 
00677 struct lineFile *pslFileOpen(char *fileName)
00678 /* Read header part of psl and make sure it's right. 
00679  * Return line file handle to it. */
00680 {
00681 enum gfType qt, tt;
00682 struct lineFile *lf;
00683 pslxFileOpen(fileName, &qt, &tt, &lf);
00684 return lf;
00685 }
00686 
00687 struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f)
00688 /* Read header part of psl and make sure it's right. 
00689  * Return line file handle to it. */
00690 {
00691 enum gfType qt, tt;
00692 struct lineFile *lf;
00693 pslxFileOpenWithMeta(fileName, &qt, &tt, &lf, f);
00694 return lf;
00695 }
00696 
00697 struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f)
00698 /* Read header part of psl and make sure it's right. 
00699  * Set flag to suppress duplicate header comments.
00700  * Return line file handle to it. */
00701 {
00702 enum gfType qt, tt;
00703 struct lineFile *lf;
00704 pslxFileOpenWithUniqueMeta(fileName, &qt, &tt, &lf, f);
00705 return lf;
00706 }
00707 
00708 struct psl *pslNext(struct lineFile *lf)
00709 /* Read next line from file and convert it to psl.  Return
00710  * NULL at eof. */
00711 {
00712 char *line;
00713 int lineSize;
00714 char *words[32];
00715 int wordCount;
00716 static int lineAlloc = 0;
00717 static char *chopBuf = NULL;
00718 
00719 if (!lineFileNextReal(lf, &line))
00720     {
00721     return NULL;
00722     }
00723 lineSize = strlen(line);
00724 if (lineSize >= lineAlloc)
00725     {
00726     lineAlloc = lineSize+256;
00727     chopBuf = needMoreMem(chopBuf, 0, lineAlloc);
00728     }
00729 memcpy(chopBuf, line, lineSize+1);
00730 wordCount = chopLine(chopBuf, words);
00731 if (wordCount == 21)
00732     {
00733     return pslLoad(words);
00734     }
00735 if (wordCount == 23)
00736     {
00737     return pslxLoad(words);
00738     }
00739 else
00740     {
00741     errAbort("Bad line %d of %s wordCount is %d instead of 21 or 23\n", lf->lineIx, lf->fileName, wordCount);
00742     return NULL;
00743     }
00744 }
00745 
00746 struct psl *pslxLoadLm(char **row, struct lm *lm)
00747 /* Load row into local memory pslx. */
00748 {
00749 struct psl *ret = pslLoadLm(row, lm);
00750 ret->qSequence = lmAlloc(lm, sizeof(ret->qSequence[0]) * ret->blockCount);
00751 sqlStringArray(lmCloneString(lm,row[21]),ret->qSequence, ret->blockCount);
00752 ret->tSequence = lmAlloc(lm, sizeof(ret->tSequence[0]) * ret->blockCount);
00753 sqlStringArray(lmCloneString(lm,row[22]),ret->tSequence, ret->blockCount);
00754 return ret;
00755 }
00756 
00757 struct psl *pslLoadLm(char **row, struct lm *lm)
00758 /* Load row into local memory psl. */
00759 {
00760 struct psl *ret;
00761 
00762 ret = lmAlloc(lm, sizeof(*ret));
00763 ret->blockCount = sqlUnsigned(row[17]);
00764 ret->match = sqlUnsigned(row[0]);
00765 ret->misMatch = sqlUnsigned(row[1]);
00766 ret->repMatch = sqlUnsigned(row[2]);
00767 ret->nCount = sqlUnsigned(row[3]);
00768 ret->qNumInsert = sqlUnsigned(row[4]);
00769 ret->qBaseInsert = sqlSigned(row[5]);
00770 ret->tNumInsert = sqlUnsigned(row[6]);
00771 ret->tBaseInsert = sqlSigned(row[7]);
00772 strcpy(ret->strand, row[8]);
00773 ret->qName = lmCloneString(lm,row[9]);
00774 ret->qSize = sqlUnsigned(row[10]);
00775 ret->qStart = sqlUnsigned(row[11]);
00776 ret->qEnd = sqlUnsigned(row[12]);
00777 ret->tName = lmCloneString(lm, row[13]);
00778 ret->tSize = sqlUnsigned(row[14]);
00779 ret->tStart = sqlUnsigned(row[15]);
00780 ret->tEnd = sqlUnsigned(row[16]);
00781 ret->blockSizes = lmAlloc(lm, sizeof(ret->blockSizes[0]) * ret->blockCount);
00782 sqlUnsignedArray(row[18], ret->blockSizes, ret->blockCount);
00783 ret->qStarts = lmAlloc(lm, sizeof(ret->qStarts[0]) * ret->blockCount);
00784 sqlUnsignedArray(row[19], ret->qStarts, ret->blockCount);
00785 ret->tStarts = lmAlloc(lm, sizeof(ret->tStarts[0]) * ret->blockCount);
00786 sqlUnsignedArray(row[20], ret->tStarts, ret->blockCount);
00787 return ret;
00788 }
00789 
00790 boolean pslIsProtein(const struct psl *psl)
00791 /* is psl a protein psl (are it's blockSizes and scores in protein space) */
00792 {
00793 int lastBlock = psl->blockCount - 1;
00794 
00795 return  (((psl->strand[1] == '+' ) &&
00796     (psl->tEnd == psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])) ||
00797    ((psl->strand[1] == '-') && 
00798     (psl->tStart == (psl->tSize-(psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])))));
00799 }
00800 
00801 int pslCalcMilliBad(struct psl *psl, boolean isMrna)
00802 /* Calculate badness in parts per thousand. */
00803 {
00804 int sizeMul = pslIsProtein(psl) ? 3 : 1;
00805 int qAliSize, tAliSize, aliSize;
00806 int milliBad = 0;
00807 int sizeDif;
00808 int insertFactor;
00809 int total;
00810 
00811 qAliSize = sizeMul * (psl->qEnd - psl->qStart);
00812 tAliSize = psl->tEnd - psl->tStart;
00813 aliSize = min(qAliSize, tAliSize);
00814 if (aliSize <= 0)
00815     return 0;
00816 sizeDif = qAliSize - tAliSize;
00817 if (sizeDif < 0)
00818     {
00819     if (isMrna)
00820         sizeDif = 0;
00821     else
00822         sizeDif = -sizeDif;
00823     }
00824 insertFactor = psl->qNumInsert;
00825 if (!isMrna)
00826     insertFactor += psl->tNumInsert;
00827 
00828 total = (sizeMul * (psl->match + psl->repMatch + psl->misMatch));
00829 if (total != 0)
00830     milliBad = (1000 * (psl->misMatch*sizeMul + insertFactor + round(3*log(1+sizeDif)))) / total;
00831 return milliBad;
00832 }
00833 
00834 int pslScore(const struct psl *psl)
00835 /* Return score for psl. */
00836 {
00837 int sizeMul = pslIsProtein(psl) ? 3 : 1;
00838 
00839 return sizeMul * (psl->match + ( psl->repMatch>>1)) - 
00840         sizeMul * psl->misMatch - psl->qNumInsert - psl->tNumInsert;
00841 }
00842 
00843 int pslCmpScoreDesc(const void *va, const void *vb)
00844 /* Compare to sort based on score. */
00845 {
00846 const struct psl *a = *((struct psl **)va);
00847 const struct psl *b = *((struct psl **)vb);
00848 return pslScore(b) - pslScore(a);
00849 }
00850 
00851 
00852 struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack)
00853 /* Convert from psl to ffAli format.  In some cases you can pass NULL
00854  * for needle and haystack - depending what the post-processing is going
00855  * to be. */
00856 {
00857 struct ffAli *ffList = NULL, *ff;
00858 int blockCount = psl->blockCount;
00859 unsigned *blockSizes = psl->blockSizes;
00860 unsigned *qStarts = psl->qStarts;
00861 unsigned *tStarts = psl->tStarts;
00862 int size;
00863 int i;
00864 
00865 for (i=0; i<blockCount; ++i)
00866     {
00867     size = blockSizes[i];
00868     AllocVar(ff);
00869     ff->left = ffList;
00870     ffList = ff;
00871     ff->nStart = ff->nEnd = needle + qStarts[i];
00872     ff->nEnd += size;
00873     ff->hStart = ff->hEnd = haystack + tStarts[i];
00874     ff->hEnd += size;
00875     }
00876 ffList = ffMakeRightLinks(ffList);
00877 return ffList;
00878 }
00879 
00880 struct psl *pslFromFakeFfAli(struct ffAli *ff, 
00881         DNA *needle, DNA *haystack, char strand,
00882         char *qName, int qSize, char *tName, int tSize)
00883 /* This will create a basic psl structure from a sorted series of ffAli
00884  * blocks.  The fields that would need actual sequence to be filled in
00885  * are left zero however - fields including match, repMatch, mismatch. */
00886 {
00887 struct psl *psl;
00888 unsigned *blockSizes;
00889 unsigned *qStarts;
00890 unsigned *tStarts;
00891 int blockCount;
00892 int i;
00893 int nStart, hStart;
00894 int nEnd, hEnd;
00895 
00896 AllocVar(psl);
00897 psl->blockCount = blockCount = ffAliCount(ff);
00898 psl->blockSizes = AllocArray(blockSizes, blockCount);
00899 psl->qStarts = AllocArray(qStarts, blockCount);
00900 psl->tStarts = AllocArray(tStarts, blockCount);
00901 psl->qName = cloneString(qName);
00902 psl->qSize = qSize;
00903 psl->tName = cloneString(tName);
00904 psl->tSize = tSize;
00905 psl->strand[0] = strand;
00906 
00907 for (i=0; i<blockCount; ++i)
00908     {
00909     nStart = ff->nStart - needle;
00910     nEnd = ff->nEnd - needle;
00911     hStart = ff->hStart - haystack;
00912     hEnd = ff->hEnd - haystack;
00913     blockSizes[i] = nEnd - nStart;
00914     qStarts[i] = nStart;
00915     tStarts[i] = hStart;
00916     if (i == 0)
00917        {
00918        psl->qStart = nStart;
00919        psl->tStart = hStart;
00920        }
00921     if (i == blockCount-1)
00922        {
00923        psl->qEnd = nEnd;
00924        psl->tEnd = hEnd;
00925        }
00926     ff = ff->right;
00927     }
00928 if (strand == '-')
00929     {
00930     reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
00931     }
00932 return psl;
00933 }
00934 
00935 struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
00936         int targetOffset)
00937 /* Convert from psl to ffAli format.  Clip to parts that we actually
00938  * have sequence for. */
00939 {
00940 struct ffAli *ffList = NULL, *ff;
00941 DNA *needle = query->dna;
00942 DNA *haystack = target->dna;
00943 int blockCount = psl->blockCount;
00944 unsigned *blockSizes = psl->blockSizes;
00945 unsigned *qStarts = psl->qStarts;
00946 unsigned *tStarts = psl->tStarts;
00947 int size;
00948 int i;
00949 int tMin = targetOffset;
00950 int tMax = targetOffset + target->size;
00951 int tStart, tEnd;
00952 int clipStart, clipEnd, clipOffset, clipSize;
00953 
00954 for (i=0; i<blockCount; ++i)
00955     {
00956     clipStart = tStart = tStarts[i];
00957     size = blockSizes[i];
00958     clipEnd = tEnd = tStart + size;
00959     if (tStart < tMax && tEnd > tMin)
00960         {
00961         if (clipStart < tMin) clipStart = tMin;
00962         if (clipEnd > tMax) clipEnd = tMax;
00963         clipOffset = clipStart - tStart;
00964         clipSize = clipEnd - clipStart;
00965         AllocVar(ff);
00966         ff->left = ffList;
00967         ffList = ff;
00968         ff->nStart = ff->nEnd = needle + qStarts[i] + clipOffset;
00969         ff->nEnd += clipSize;
00970         ff->hStart = ff->hEnd = haystack + clipStart - targetOffset;
00971         ff->hEnd += clipSize;
00972         }
00973     }
00974 ffList = ffMakeRightLinks(ffList);
00975 ffCountGoodEnds(ffList);
00976 return ffList;
00977 }
00978 
00979 int pslOrientation(struct psl *psl)
00980 /* Translate psl strand + or - to orientation +1 or -1 */
00981 {
00982 if (psl->strand[1] != '\0')
00983     {
00984     /* translated blat */
00985     if (psl->strand[0] != psl->strand[1])
00986         return -1;
00987     else
00988         return 1;
00989     }
00990 else
00991     {
00992     if (psl->strand[0] == '-')
00993         return -1;
00994     else
00995         return 1;
00996     }
00997 }
00998 
00999 int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
01000 /* Return >0 if introns make it look like alignment is on + strand,
01001  *        <0 if introns make it look like alignment is on - strand,
01002  *        0 if can't tell.  The absolute value of the return indicates
01003  * how many splice sites we've seen supporting the orientation.
01004  * Sequence should NOT be reverse complemented.  */
01005 {
01006 int intronDir = 0;
01007 int oneDir;
01008 int i;
01009 DNA *dna = genoSeq->dna;
01010 
01011 /* code below doesn't support negative target strand (translated blat) */
01012 if (psl->strand[1] == '-')
01013     errAbort("pslWeightedIntronOrientation doesn't support a negative target strand");
01014 
01015 for (i=1; i<psl->blockCount; ++i)
01016     {
01017     int iStart, iEnd, blockSize = psl->blockSizes[i-1];
01018     if (psl->qStarts[i-1] + blockSize == psl->qStarts[i])
01019         {
01020         iStart = psl->tStarts[i-1] + psl->blockSizes[i-1] - offset;
01021         iEnd = psl->tStarts[i] - offset;
01022         oneDir = intronOrientation(dna+iStart, dna+iEnd);
01023         intronDir += oneDir;
01024         }
01025     }
01026 return intronDir;
01027 }
01028 
01029 int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
01030 /* Return 1 if introns make it look like alignment is on + strand,
01031  *       -1 if introns make it look like alignment is on - strand,
01032  *        0 if can't tell.
01033  * Sequence should NOT be reverse complemented.  */
01034 {
01035 int intronDir = pslWeightedIntronOrientation(psl, genoSeq, offset);
01036 if (intronDir < 0)
01037     intronDir = -1;
01038 else if (intronDir > 0)
01039     intronDir = 1;
01040 return intronDir;
01041 }
01042 
01043 boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset)
01044 /* Return TRUE if there's a probable intron. Sequence should NOT be
01045  * reverse complemented.*/
01046 {
01047 int blockCount = psl->blockCount, i;
01048 unsigned *tStarts = psl->tStarts;
01049 unsigned *blockSizes = psl->blockSizes;
01050 unsigned *qStarts = psl->qStarts;
01051 int blockSize, start, end;
01052 DNA *dna = seq->dna;
01053 
01054 for (i=1; i<blockCount; ++i)
01055     {
01056     blockSize = blockSizes[i-1];
01057     start = qStarts[i-1]+blockSize;
01058     end = qStarts[i];
01059     if (start == end)
01060         {
01061         start = tStarts[i-1] + blockSize;
01062         end = tStarts[i];
01063         if (psl->strand[1] == '-')
01064             reverseIntRange(&start, &end, psl->tSize);
01065         start -= seqOffset;
01066         end -= seqOffset;
01067         if (intronOrientation(dna+start, dna+end) != 0)
01068             return TRUE;
01069         }
01070     }
01071 return FALSE;
01072 }
01073 
01074 void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail)
01075 /* Find the length of "tails" (rather than extensions) implied by psl. */
01076 {
01077 int orientation = pslOrientation(psl);
01078 int qFloppyStart, qFloppyEnd;
01079 int tFloppyStart, tFloppyEnd;
01080 
01081 if (orientation > 0)
01082     {
01083     qFloppyStart = psl->qStart;
01084     qFloppyEnd = psl->qSize - psl->qEnd;
01085     }
01086 else
01087     {
01088     qFloppyStart = psl->qSize - psl->qEnd;
01089     qFloppyEnd = psl->qStart;
01090     }
01091 tFloppyStart = psl->tStart;
01092 tFloppyEnd = psl->tSize - psl->tEnd;
01093 *retStartTail = min(qFloppyStart, tFloppyStart);
01094 *retEndTail = min(qFloppyEnd, tFloppyEnd);
01095 }
01096 
01097 static void rcSeqs(char **seqs, unsigned blockCount, unsigned *blockSizes)
01098 /* reverses complement sequences in list, maintain property that all strings
01099  * are in one malloc block.   blockSizes should already be reversed. */
01100 {
01101 char *buf, *next;
01102 int i, memSz = 0;
01103 
01104 /* get a new memory block for strings */
01105 for (i = 0; i < blockCount; i++)
01106     memSz += blockSizes[i]+1;
01107 next = buf = needLargeMem(memSz);
01108 
01109 /* reverse compliment and copy to new memory block */
01110 for (i = blockCount-1; i >= 0; i--)
01111     {
01112     int len = strlen(seqs[i]);
01113     reverseComplement(seqs[i], len);
01114     memcpy(next, seqs[i], len+1);
01115     next += len+1;
01116     }
01117 
01118 /* swap memory and update pointers */
01119 freeMem(seqs[0]);
01120 seqs[0] = buf;
01121 next = buf;
01122 
01123 for (i = 0; i < blockCount; i++)
01124     {
01125     seqs[i] = next;
01126     next += blockSizes[i]+1;
01127     }
01128 }
01129 
01130 void pslRcBoth(struct psl *psl)
01131 /* Swap around things in psl so it works as if the alignment
01132  * was done on the reverse strand of the target. */
01133 {
01134 int tSize = psl->tSize, qSize = psl->qSize;
01135 int blockCount = psl->blockCount, i;
01136 unsigned *tStarts = psl->tStarts, *qStarts = psl->qStarts, *blockSizes = psl->blockSizes;
01137 
01138 reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
01139 for (i=0; i<blockCount; ++i)
01140     {
01141     tStarts[i] = (int)tSize - ((int)tStarts[i] + (int)blockSizes[i]);
01142     qStarts[i] = (int)qSize - ((int)qStarts[i] + (int)blockSizes[i]);
01143     }
01144 reverseUnsigned(tStarts, blockCount);
01145 reverseUnsigned(qStarts, blockCount);
01146 reverseUnsigned(blockSizes, blockCount);
01147 }
01148 
01149 void pslRc(struct psl *psl)
01150 /* reverse-complement a PSL alignment.  This makes target strand explicit. */
01151 {
01152 int i;
01153 
01154 /* swap strand, forcing target to have an explict strand */
01155 psl->strand[0] = (psl->strand[0] != '-') ? '-' : '+';
01156 psl->strand[1] = (psl->strand[1] != '-') ? '-' : '+';
01157 
01158 for (i = 0; i < psl->blockCount; i++)
01159     {
01160     psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
01161     psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
01162     }
01163 reverseUnsigned(psl->tStarts, psl->blockCount);
01164 reverseUnsigned(psl->qStarts, psl->blockCount);
01165 reverseUnsigned(psl->blockSizes, psl->blockCount);
01166 if (psl->qSequence != NULL)
01167     {
01168     rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes);
01169     rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes);
01170     }
01171 }
01172 
01173 /* macro to swap to variables */
01174 #define swapVars(a, b, tmp) ((tmp) = (a), (a) = (b), (b) = (tmp))
01175 
01176 static void swapBlocks(struct psl *psl)
01177 /* Swap the blocks in a psl without reverse complementing them. */
01178 {
01179 int i;
01180 unsigned utmp;
01181 char *stmp; 
01182 for (i = 0; i < psl->blockCount; i++)
01183     {
01184     swapVars(psl->qStarts[i], psl->tStarts[i], utmp);
01185     if (psl->qSequence != NULL)
01186         swapVars(psl->qSequence[i], psl->tSequence[i], stmp);
01187     }
01188 }
01189 
01190 static void swapRCBlocks(struct psl *psl)
01191 /* Swap and reverse complement blocks in a psl. Other psl fields must
01192  * be modified first */
01193 {
01194 int i;
01195 unsigned *uatmp;
01196 char **satmp;
01197 reverseUnsigned(psl->tStarts, psl->blockCount);
01198 reverseUnsigned(psl->qStarts, psl->blockCount);
01199 reverseUnsigned(psl->blockSizes, psl->blockCount);
01200 swapVars(psl->tStarts, psl->qStarts, uatmp);
01201 
01202 /* qSize and tSize have already been swapped */
01203 for (i = 0; i < psl->blockCount; i++)
01204     {
01205     psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
01206     psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
01207     }
01208 if (psl->qSequence != NULL)
01209     {
01210     /* note: all block sequences are stored in one malloc block, which is
01211      * entry zero */
01212     rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes);
01213     rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes);
01214     swapVars(psl->qSequence, psl->tSequence, satmp);
01215     }
01216 }
01217 
01218 void pslSwap(struct psl *psl, boolean noRc)
01219 /* swap query and target in psl.  If noRc is TRUE, don't reverse-complement
01220  * PSL if needed, instead make target strand explict. */
01221 {
01222 int itmp;
01223 unsigned utmp;
01224 char ctmp, *stmp; 
01225 swapVars(psl->qBaseInsert, psl->tBaseInsert, utmp);
01226 swapVars(psl->tNumInsert, psl->qNumInsert, utmp);
01227 swapVars(psl->qName, psl->tName, stmp);
01228 swapVars(psl->qSize, psl->tSize, utmp);
01229 swapVars(psl->qStart, psl->tStart, itmp);
01230 swapVars(psl->qEnd, psl->tEnd, itmp);
01231 
01232 /* handle strand and block copy */
01233 if (psl->strand[1] != '\0')
01234     {
01235     /* translated */
01236     swapVars(psl->strand[0], psl->strand[1], ctmp);
01237     swapBlocks(psl);
01238     }
01239 else if (noRc)
01240     {
01241     /* untranslated with no reverse complement */
01242     psl->strand[1] = psl->strand[0];
01243     psl->strand[0] = '+';
01244     swapBlocks(psl);
01245     }
01246 else
01247     {
01248     /* untranslated */
01249     if (psl->strand[0] == '+')
01250         swapBlocks(psl);
01251     else
01252         swapRCBlocks(psl);
01253     }
01254 }
01255 
01256 void pslTargetOffset(struct psl *psl, int offset)
01257 /* Add offset to target positions in psl. */
01258 {
01259 int i, blockCount = psl->blockCount;
01260 unsigned *tStarts = psl->tStarts;
01261 psl->tStart += offset;
01262 psl->tEnd += offset;
01263 for (i=0; i<blockCount; ++i)
01264    tStarts[i] += offset;
01265 }
01266 
01267 void pslDump(struct psl *psl, FILE *f)
01268 /* Dump most of PSL to file - for debugging. */
01269 {
01270 int i;
01271 fprintf(f, "<PRE>\n");
01272 fprintf(f, "psl %s:%d-%d %s %s:%d-%d %d\n", 
01273         psl->qName, psl->qStart, psl->qEnd, psl->strand,
01274         psl->tName, psl->tStart, psl->tEnd, psl->blockCount);
01275 for (i=0; i<psl->blockCount; ++i)
01276     fprintf(f, "  size %d, qStart %d, tStart %d\n", 
01277         psl->blockSizes[i], psl->qStarts[i], psl->tStarts[i]);
01278 fprintf(f, "</PRE>");
01279 }
01280 
01281 static void pslRecalcBounds(struct psl *psl)
01282 /* Calculate qStart/qEnd tStart/tEnd at top level to be consistent
01283  * with blocks. */
01284 {
01285 int qStart, qEnd, tStart, tEnd, size;
01286 int last = psl->blockCount - 1;
01287 qStart = psl->qStarts[0];
01288 tStart = psl->tStarts[0];
01289 size = psl->blockSizes[last];
01290 qEnd = psl->qStarts[last] + size;
01291 tEnd = psl->tStarts[last] + size;
01292 if (psl->strand[0] == '-')
01293     reverseIntRange(&qStart, &qEnd, psl->qSize);
01294 if (psl->strand[1] == '-')
01295     reverseIntRange(&tStart, &tEnd, psl->tSize);
01296 psl->qStart = qStart;
01297 psl->qEnd = qEnd;
01298 psl->tStart = tStart;
01299 psl->tEnd = tEnd;
01300 }
01301 
01302 struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax)
01303 /* Return psl trimmed to fit inside tMin/tMax.  Note this does not
01304  * update the match/misMatch and related fields. */
01305 {
01306 int newSize;
01307 int oldBlockCount = oldPsl->blockCount;
01308 boolean tIsRc = (oldPsl->strand[1] == '-');
01309 int newBlockCount = 0, completeBlockCount = 0;
01310 int i;
01311 struct psl *newPsl = NULL;
01312 int tMn = tMin, tMx = tMax;   /* tMin/tMax adjusted for strand. */
01313 
01314 /* Deal with case where we're completely trimmed out quickly. */
01315 newSize = rangeIntersection(oldPsl->tStart, oldPsl->tEnd, tMin, tMax);
01316 if (newSize <= 0)
01317     return NULL;
01318 
01319 if (tIsRc)
01320     reverseIntRange(&tMn, &tMx, oldPsl->tSize);
01321 
01322 /* Count how many blocks will survive trimming. */
01323 oldBlockCount = oldPsl->blockCount;
01324 for (i=0; i<oldBlockCount; ++i)
01325     {
01326     int s = oldPsl->tStarts[i];
01327     int e = s + oldPsl->blockSizes[i];
01328     int sz = e - s;
01329     int overlap;
01330     if ((overlap = rangeIntersection(s, e, tMn, tMx)) > 0)
01331         ++newBlockCount;
01332     if (overlap == sz)
01333         ++completeBlockCount;
01334     }
01335 
01336 if (newBlockCount == 0)
01337     return NULL;
01338 
01339 /* Allocate new psl and fill in what we already know. */
01340 AllocVar(newPsl);
01341 strcpy(newPsl->strand, oldPsl->strand);
01342 newPsl->qName = cloneString(oldPsl->qName);
01343 newPsl->qSize = oldPsl->qSize;
01344 newPsl->tName = cloneString(oldPsl->tName);
01345 newPsl->tSize = oldPsl->tSize;
01346 newPsl->blockCount = newBlockCount;
01347 AllocArray(newPsl->blockSizes, newBlockCount);
01348 AllocArray(newPsl->qStarts, newBlockCount);
01349 AllocArray(newPsl->tStarts, newBlockCount);
01350 
01351 /* Fill in blockSizes, qStarts, tStarts with real data. */
01352 newBlockCount = completeBlockCount = 0;
01353 for (i=0; i<oldBlockCount; ++i)
01354     {
01355     int oldSz = oldPsl->blockSizes[i];
01356     int sz = oldSz;
01357     int tS = oldPsl->tStarts[i];
01358     int tE = tS + sz;
01359     int qS = oldPsl->qStarts[i];
01360     int qE = qS + sz;
01361     if (rangeIntersection(tS, tE, tMn, tMx) > 0)
01362         {
01363         int diff;
01364         if ((diff = (tMn - tS)) > 0)
01365             {
01366             tS += diff;
01367             qS += diff;
01368             sz -= diff;
01369             }
01370         if ((diff = (tE - tMx)) > 0)
01371             {
01372             tE -= diff;
01373             qE -= diff;
01374             sz -= diff;
01375             }
01376         newPsl->qStarts[newBlockCount] = qS;
01377         newPsl->tStarts[newBlockCount] = tS;
01378         newPsl->blockSizes[newBlockCount] = sz;
01379         ++newBlockCount;
01380         if (sz == oldSz)
01381             ++completeBlockCount;
01382         }
01383     }
01384 pslRecalcBounds(newPsl);
01385 return newPsl;
01386 }
01387 
01388 struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax)
01389 /* Return psl trimmed to fit inside qMin/qMax.  Note this does not
01390  * update the match/misMatch and related fields. */
01391 {
01392 int newSize;
01393 int oldBlockCount = oldPsl->blockCount;
01394 boolean qIsRc = (oldPsl->strand[0] == '-');
01395 int newBlockCount = 0, completeBlockCount = 0;
01396 int i;
01397 struct psl *newPsl = NULL;
01398 int qMn = qMin, qMx = qMax;   /* qMin/qMax adjusted for strand. */
01399 
01400 /* Deal with case where we're completely trimmed out quickly. */
01401 newSize = rangeIntersection(oldPsl->qStart, oldPsl->qEnd, qMin, qMax);
01402 if (newSize <= 0)
01403     return NULL;
01404 
01405 if (qIsRc)
01406     reverseIntRange(&qMn, &qMx, oldPsl->qSize);
01407 
01408 /* Count how many blocks will survive trimming. */
01409 oldBlockCount = oldPsl->blockCount;
01410 for (i=0; i<oldBlockCount; ++i)
01411     {
01412     int s = oldPsl->qStarts[i];
01413     int e = s + oldPsl->blockSizes[i];
01414     int sz = e - s;
01415     int overlap;
01416     if ((overlap = rangeIntersection(s, e, qMn, qMx)) > 0)
01417         ++newBlockCount;
01418     if (overlap == sz)
01419         ++completeBlockCount;
01420     }
01421 
01422 if (newBlockCount == 0)
01423     return NULL;
01424 
01425 /* Allocate new psl and fill in what we already know. */
01426 AllocVar(newPsl);
01427 strcpy(newPsl->strand, oldPsl->strand);
01428 newPsl->qName = cloneString(oldPsl->qName);
01429 newPsl->qSize = oldPsl->qSize;
01430 newPsl->tName = cloneString(oldPsl->tName);
01431 newPsl->tSize = oldPsl->tSize;
01432 newPsl->blockCount = newBlockCount;
01433 AllocArray(newPsl->blockSizes, newBlockCount);
01434 AllocArray(newPsl->qStarts, newBlockCount);
01435 AllocArray(newPsl->tStarts, newBlockCount);
01436 
01437 /* Fill in blockSizes, qStarts, tStarts with real data. */
01438 newBlockCount = completeBlockCount = 0;
01439 for (i=0; i<oldBlockCount; ++i)
01440     {
01441     int oldSz = oldPsl->blockSizes[i];
01442     int sz = oldSz;
01443     int qS = oldPsl->qStarts[i];
01444     int qE = qS + sz;
01445     int tS = oldPsl->tStarts[i];
01446     int tE = tS + sz;
01447     if (rangeIntersection(qS, qE, qMn, qMx) > 0)
01448         {
01449         int diff;
01450         if ((diff = (qMn - qS)) > 0)
01451             {
01452             tS += diff;
01453             qS += diff;
01454             sz -= diff;
01455             }
01456         if ((diff = (qE - qMx)) > 0)
01457             {
01458             tE -= diff;
01459             qE -= diff;
01460             sz -= diff;
01461             }
01462         newPsl->qStarts[newBlockCount] = qS;
01463         newPsl->tStarts[newBlockCount] = tS;
01464         newPsl->blockSizes[newBlockCount] = sz;
01465         ++newBlockCount;
01466         if (sz == oldSz)
01467             ++completeBlockCount;
01468         }
01469     }
01470 pslRecalcBounds(newPsl);
01471 return newPsl;
01472 }
01473 char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen)
01474 /* Get SQL required to create PSL table.  Options is a bit set consisting
01475  * of PSL_TNAMEIX, PSL_WITH_BIN, and PSL_XA_FORMAT.  tNameIdxLen is
01476  * the number of characters in target name to index.  If greater than
01477  * zero, must specify PSL_TNAMEIX.  If zero and PSL_TNAMEIX is specified,
01478  * to will default to 8. */
01479 {
01480 struct dyString *sqlCmd = newDyString(2048);
01481 char *sqlCmdStr;
01482 char binIx[32];
01483 
01484 binIx[0] = '\0';
01485 
01486 /* check and default tNameIdxLen */
01487 if ((tNameIdxLen > 0) && !(options & PSL_TNAMEIX))
01488     errAbort("pslGetCreateSql: must specify PSL_TNAMEIX with tNameIdxLen > 0");
01489 if ((options & PSL_TNAMEIX) && (tNameIdxLen == 0))
01490     tNameIdxLen = 8;
01491 
01492 /* setup tName and bin index fields */
01493 if (options & PSL_WITH_BIN)
01494     {
01495     if (options & PSL_TNAMEIX)
01496         safef(binIx, sizeof(binIx), "INDEX(tName(%d),bin),\n", tNameIdxLen);
01497     else
01498         safef(binIx, sizeof(binIx), "INDEX(bin),\n");
01499     }
01500 dyStringPrintf(sqlCmd, createString, table, 
01501     ((options & PSL_WITH_BIN) ? "bin smallint unsigned not null,\n" : ""));
01502 if (options & PSL_XA_FORMAT)
01503     {
01504     dyStringPrintf(sqlCmd, "qSeq longblob not null,\n");
01505     dyStringPrintf(sqlCmd, "tSeq longblob not null,\n");
01506     }
01507 dyStringPrintf(sqlCmd, indexString, binIx);
01508 sqlCmdStr = cloneString(sqlCmd->string);
01509 dyStringFree(&sqlCmd);
01510 return sqlCmdStr;
01511 }
01512 
01513 static void printPslDesc(char* pslDesc, FILE* out, struct psl* psl)
01514 /* print description of a PSL on first error */
01515 {
01516 fprintf(out, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n",
01517         psl->qName, psl->qStart, psl->qEnd,
01518         psl->tName, psl->tStart, psl->tEnd,
01519         psl->strand, pslDesc);
01520 }
01521 
01522 static void chkRanges(char* pslDesc, FILE* out, struct psl* psl,
01523                       char* pName, char* pLabel, char pCLabel, char pStrand,
01524                       unsigned pSize, unsigned pStart, unsigned pEnd,
01525                       unsigned blockCount, unsigned* blockSizes,
01526                       unsigned* pBlockStarts, int* errCountPtr)
01527 /* check the target or query ranges in a PSL, increment errorCnt */
01528 {
01529 int errCount = *errCountPtr;
01530 unsigned iBlk, prevBlkEnd = 0;
01531 
01532 if (pStart >= pEnd)
01533     {
01534     if (errCount == 0)
01535         printPslDesc(pslDesc, out, psl);
01536     fprintf(out, "\t%s %cStart %u >= %cEnd %u\n",
01537             pName, pCLabel, pStart, pCLabel, pEnd);
01538     errCount++;
01539     }
01540 if (pEnd > pSize)
01541     {
01542     if (errCount == 0)
01543         printPslDesc(pslDesc, out, psl);
01544     fprintf(out, "\t%s %cEnd %u >= %cSize %u\n",
01545             pName, pCLabel, pEnd, pCLabel, pSize);
01546     errCount++;
01547     }
01548 for (iBlk = 0; iBlk < blockCount; iBlk++)
01549     {
01550     unsigned blkStart = pBlockStarts[iBlk];
01551     unsigned blkEnd = blkStart+blockSizes[iBlk];
01552     /* translate stand to genomic coords */
01553     unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd);
01554     unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart);
01555 
01556     if ((pSize > 0) && (blkEnd > pSize))
01557         {
01558         if (errCount == 0)
01559             printPslDesc(pslDesc, out, psl);
01560         fprintf(out, "\t%s %s block %u end %u > %cSize %u\n",
01561                 pName, pLabel, iBlk, blkEnd, pCLabel, pSize);
01562         errCount++;
01563         }
01564     if (gBlkStart < pStart)
01565         {
01566         if (errCount == 0)
01567             printPslDesc(pslDesc, out, psl);
01568         fprintf(out, "\t%s %s block %u start %u < %cStart %u\n",
01569                 pName, pLabel, iBlk, gBlkStart, pCLabel, pStart);
01570         errCount++;
01571         }
01572     if (gBlkStart >= pEnd)
01573         {
01574         if (errCount == 0)
01575             printPslDesc(pslDesc, out, psl);
01576         fprintf(out, "\t%s %s block %u start %u >= %cEnd %u\n",
01577                 pName, pLabel, iBlk, gBlkStart, pCLabel, pEnd);
01578         errCount++;
01579         }
01580     if (gBlkEnd < pStart)
01581         {
01582         if (errCount == 0)
01583             printPslDesc(pslDesc, out, psl);
01584         fprintf(out, "\t%s %s block %u end %u < %cStart %u\n",
01585                 pName, pLabel, iBlk, gBlkEnd, pCLabel, pStart);
01586         errCount++;
01587         }
01588     if (gBlkEnd > pEnd)
01589         {
01590         if (errCount == 0)
01591             printPslDesc(pslDesc, out, psl);
01592         fprintf(out, "\t%s %s block %u end %u > %cEnd %u\n",
01593                 pName, pLabel, iBlk, gBlkEnd, pCLabel, pEnd);
01594         errCount++;
01595         }
01596     if ((iBlk > 0) && (blkStart < prevBlkEnd))
01597         {
01598         if (errCount == 0)
01599             printPslDesc(pslDesc, out, psl);
01600         fprintf(out, "\t%s %s block %u start %u < previous block end %u\n",
01601                 pName, pLabel, iBlk, blkStart, prevBlkEnd);
01602         errCount++;
01603         }
01604     prevBlkEnd = blkEnd;
01605     }
01606 *errCountPtr = errCount;
01607 }
01608 
01609 int pslCheck(char *pslDesc, FILE* out, struct psl* psl)
01610 /* Validate a PSL for consistency.  pslDesc is printed the error messages
01611  * to file out (open /dev/null to discard). Return count of errors. */
01612 {
01613 static char* VALID_STRANDS[] = {
01614     "+", "-", "++", "+-", "-+", "--", NULL
01615 };
01616 int i, errCount = 0;
01617 char strand;
01618 boolean isProt = FALSE;
01619 
01620 /* check strand value */
01621 for (i = 0; VALID_STRANDS[i] != NULL; i++)
01622     {
01623     if (strcmp(psl->strand, VALID_STRANDS[i]) == 0)
01624         break;
01625     }
01626 if (VALID_STRANDS[i] == NULL)
01627     {
01628     if (errCount == 0)
01629         printPslDesc(pslDesc, out, psl);
01630     fprintf(out, "\tinvalid PSL strand: \"%s\"\n", psl->strand);
01631     errCount++;
01632     }
01633 
01634 /* check target */
01635 if (pslIsProtein(psl))
01636     {
01637     isProt = TRUE;
01638     for (i = 0; i < psl->blockCount ; i++)
01639         psl->blockSizes[i] *= 3;
01640     }
01641 
01642 strand = ((psl->strand[1] == '\0') ? '+' : psl->strand[1]);
01643 chkRanges(pslDesc, out, psl, psl->tName, "target", 't',
01644           strand, psl->tSize, psl->tStart, psl->tEnd,
01645           psl->blockCount, psl->blockSizes, psl->tStarts,
01646           &errCount);
01647 if (isProt)
01648     {
01649     for (i = 0; i < psl->blockCount ; i++)
01650         psl->blockSizes[i] /= 3;
01651     }
01652 
01653 /* check query */
01654 strand = psl->strand[0];
01655 chkRanges(pslDesc, out, psl, psl->qName, "query", 'q',
01656           strand, psl->qSize, psl->qStart, psl->qEnd,
01657           psl->blockCount, psl->blockSizes, psl->qStarts,
01658           &errCount);
01659 
01660 return errCount;
01661 }
01662 
01663 struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName)
01664 /* read a list of psls and return results in hash of binKeeper structure for fast query*/
01665 {
01666 struct binKeeper *bk; 
01667 struct psl *psl;
01668 struct lineFile *sf = lineFileOpen(sizeFileName, TRUE);
01669 struct lineFile *pf = lineFileOpen(pslFileName , TRUE);
01670 struct hash *hash = newHash(0);
01671 char *chromRow[2];
01672 char *row[21] ;
01673 
01674 while (lineFileRow(sf, chromRow))
01675     {
01676     char *name = chromRow[0];
01677     int size = lineFileNeedNum(sf, chromRow, 1);
01678 
01679     if (hashLookup(hash, name) != NULL)
01680         warn("Duplicate %s, ignoring all but first\n", name);
01681     else
01682         {
01683         bk = binKeeperNew(0, size);
01684         assert(size > 1);
01685         hashAdd(hash, name, bk);
01686         }
01687     }
01688 while (lineFileNextRow(pf, row, ArraySize(row)))
01689     {
01690     psl = pslLoad(row);
01691     bk = hashMustFindVal(hash, psl->tName);
01692     binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
01693     }
01694 lineFileClose(&pf);
01695 lineFileClose(&sf);
01696 return hash;
01697 }
01698 
01699 static int countInitialChars(char *s, char c)
01700 /* Count number of initial chars in s that match c. */
01701 {
01702 int count = 0;
01703 char d;
01704 while ((d = *s++) != 0)
01705     {
01706     if (c == d)
01707         ++count;
01708     else
01709         break;
01710     }
01711 return count;
01712 }
01713 
01714 static int countTerminalChars(char *s, char c)
01715 /* Count number of terminal chars in s that match c. */
01716 {
01717 int len = strlen(s), i;
01718 int count = 0;
01719 for (i=len-1; i>=0; --i)
01720     {
01721     if (c == s[i])
01722         ++count;
01723     else
01724         break;
01725     }
01726 return count;
01727 }
01728 
01729 static int countNonInsert(char *s, int size)
01730 /* Count number of characters in initial part of s that
01731  * are not '-'. */
01732 {
01733 int count = 0;
01734 int i;
01735 for (i=0; i<size; ++i)
01736     if (*s++ != '-')
01737         ++count;
01738 return count;
01739 }
01740 
01741 static void trimAlignment(struct psl* psl, char** qStringPtr, char** tStringPtr,
01742                           int* aliSizePtr)
01743 /* remove leading or trailing indels from alignment */
01744 {
01745 char* qString = *qStringPtr;
01746 char* tString = *tStringPtr;
01747 int aliSize = *aliSizePtr;
01748 int qStartInsert = countInitialChars(qString, '-');
01749 int tStartInsert = countInitialChars(tString, '-');
01750 int qEndInsert = countTerminalChars(qString, '-');
01751 int tEndInsert = countTerminalChars(tString, '-');
01752 int startInsert = max(qStartInsert, tStartInsert);
01753 int endInsert = max(qEndInsert, tEndInsert);
01754 int qNonCount, tNonCount;
01755 
01756 if (startInsert > 0)
01757     {
01758     qNonCount = countNonInsert(qString, startInsert);
01759     tNonCount = countNonInsert(tString, startInsert);
01760     qString += startInsert;
01761     tString += startInsert;
01762     aliSize -= startInsert;
01763     psl->qStart += qNonCount;
01764     psl->tStart += tNonCount;
01765     }
01766 if (endInsert > 0)
01767     {
01768     aliSize -= endInsert;
01769     qNonCount = countNonInsert(qString+aliSize, endInsert);
01770     tNonCount = countNonInsert(tString+aliSize, endInsert);
01771     qString[aliSize] = 0;
01772     tString[aliSize] = 0;
01773     psl->qEnd -= qNonCount;
01774     psl->tEnd -= tNonCount;
01775     }
01776 *qStringPtr = qString;
01777 *tStringPtr = tString;
01778 *aliSizePtr = aliSize;
01779 /* recursive call to handle double gap */
01780 if (startInsert > 0 || endInsert > 0)
01781     trimAlignment(psl, qStringPtr, tStringPtr, aliSizePtr);
01782 }
01783 
01784 static void addBlock(struct psl* psl, int qs, int qe, int ts, int te,
01785                      int *blockSpace)
01786 /* add a block to the psl, growing if necessary */
01787 {
01788 assert((qe-qs) == (te-ts));  /* same lengths? */
01789 if (psl->blockCount == *blockSpace)
01790     pslGrow(psl, blockSpace);
01791 psl->blockSizes[psl->blockCount] = qe - qs;
01792 psl->qStarts[psl->blockCount] = qs;
01793 psl->tStarts[psl->blockCount] = ts;
01794 psl->blockCount++;
01795 }
01796 
01797 static void accumCounts(struct psl *psl, char prevQ, char prevT,
01798                         char q, char t, unsigned options)
01799 /* accumulate block and base counts  */
01800 {
01801 if ((q != '-') && (t != '-'))
01802     {
01803     /* aligned column. */
01804     char qu = toupper(q);
01805     char tu = toupper(t);
01806     if ((q == 'N') || (t == 'N'))
01807         psl->nCount++;
01808     else if (qu == tu)
01809         {
01810         if ((options & PSL_IS_SOFTMASK) && ((qu != q) || (tu != t)))
01811             psl->repMatch++;
01812         else
01813             psl->match++;
01814         }
01815     else
01816         psl->misMatch++;
01817     }
01818 else if ((q == '-') && (t != '-'))
01819     {
01820     /* target insert */
01821     psl->tBaseInsert++;
01822     if (prevQ != '-')
01823         psl->tNumInsert++;
01824     }
01825 else if ((t == '-') && (q != '-'))
01826     {
01827     /* query insert */
01828     psl->qBaseInsert++;
01829     if (prevT != '-')
01830         psl->qNumInsert++;
01831     }
01832 }
01833 
01834 struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
01835                          char *tName, int tSize, int tStart, int tEnd, char *tString,
01836                          char* strand, unsigned options)
01837 /* Create a PSL from an alignment.  Options PSL_IS_SOFTMASK if lower case
01838  * bases indicate repeat masking.  Returns NULL if alignment is empty after
01839  * triming leading and trailing indels.*/
01840 {
01841 /* Note, the unit tests for these programs exercise this function:
01842  *   hg/embossToPsl
01843  *   hg/mouseStuff/axtToPsl
01844  *   hg/spideyToPsl
01845  *   utils/est2genomeToPsl
01846  */
01847 
01848 int blockSpace = 16;
01849 struct psl* psl = NULL;
01850 int aliSize = strlen(qString);
01851 boolean eitherInsert = FALSE;   /* True if either in insert state. */
01852 int i, qs,qe,ts,te;
01853 char prevQ = '\0',  prevT = '\0';
01854 AllocVar(psl);
01855 
01856 if (strlen(tString) != aliSize)
01857     errAbort("query and target alignment strings are different lengths");
01858 
01859 psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd,
01860              strand, blockSpace, 0);
01861 trimAlignment(psl, &qString, &tString, &aliSize);
01862 
01863 /* Don't create if either query or target is zero length after trim */
01864  if ((psl->qStart == psl->qEnd) || (psl->tStart == psl->tEnd))
01865      {
01866      pslFree(&psl);
01867      return NULL;
01868      }
01869 
01870 /* Get range of alignment in strand-specified coordinates */
01871 qs = psl->qStart;
01872 qe = psl->qEnd;
01873 if (strand[0] == '-')
01874     reverseIntRange(&qs, &qe, psl->qSize);
01875 ts = psl->tStart;
01876 te = psl->tEnd;
01877 if (strand[1] == '-')
01878     reverseIntRange(&ts, &te, psl->tSize);
01879 
01880 eitherInsert = FALSE;
01881 qe = qs;  /* current block coords */
01882 te = ts;
01883 for (i=0; i<aliSize; ++i)
01884     {
01885     char q = qString[i];
01886     char t = tString[i];
01887     if ((q == '-') && (t == '-'))
01888         {
01889         continue; /* nothing in this column, just ignore it */
01890         }
01891     else if ((q == '-') || (t == '-'))
01892         {
01893         /* insert in one of the columns */
01894         if (!eitherInsert)
01895             {
01896             /* end of a block */
01897             addBlock(psl, qs, qe, ts, te, &blockSpace);
01898             eitherInsert = TRUE;
01899             }
01900         if (q != '-')
01901             qe += 1;
01902         if (t != '-')
01903             te += 1;
01904         }
01905     else
01906         {
01907         /* columns aligned */
01908         if (eitherInsert)
01909             {
01910             /* start new block */
01911             qs = qe;
01912             ts = te;
01913             eitherInsert = FALSE;
01914             }
01915         qe += 1;
01916         te += 1;
01917         }
01918     accumCounts(psl, prevQ, prevT, q, t, options);
01919     prevQ = q; /* will not include skipped empty columns */
01920     prevT = t;
01921     }
01922 addBlock(psl, qs, qe, ts, te, &blockSpace);
01923 return psl;
01924 }
01925 
01926 struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
01927                    char *tName, unsigned tSize, int tStart, int tEnd,
01928                    char *strand, unsigned blockSpace, unsigned opts)
01929 /* create a new psl with space for the specified number of blocks allocated.
01930  * pslGrow maybe used to expand this space if needed.  Valid options are
01931  * PSL_XA_FORMAT. */
01932 {
01933 struct psl *psl;
01934 AllocVar(psl);
01935 assert(blockSpace > 0);
01936 psl->qName = cloneString(qName);
01937 psl->qSize = qSize;
01938 psl->qStart = qStart;
01939 psl->qEnd = qEnd;
01940 psl->tName = cloneString(tName);
01941 psl->tSize = tSize;
01942 psl->tStart = tStart;
01943 psl->tEnd = tEnd;
01944 strncpy(psl->strand, strand, 2);
01945 AllocArray(psl->blockSizes, blockSpace);
01946 AllocArray(psl->qStarts, blockSpace);
01947 AllocArray(psl->tStarts, blockSpace);
01948 if (opts & PSL_XA_FORMAT)
01949     {
01950     AllocArray(psl->qSequence, blockSpace);
01951     AllocArray(psl->tSequence, blockSpace);
01952     }
01953 return psl;
01954 }
01955 
01956 void pslGrow(struct psl *psl, int *blockSpacePtr)
01957 /* Increase memory allocated to a psl to hold more blocks.  blockSpacePtr
01958  * should point the the current maximum number of blocks and will be
01959  * updated to with the new amount of space. */
01960 {
01961 int blockSpace = *blockSpacePtr;
01962 int newSpace = 2 * blockSpace;
01963 ExpandArray(psl->blockSizes, blockSpace, newSpace);
01964 ExpandArray(psl->qStarts, blockSpace, newSpace);
01965 ExpandArray(psl->tStarts, blockSpace, newSpace);
01966 if (psl->qSequence != NULL)
01967     {
01968     ExpandArray(psl->qSequence, blockSpace, newSpace);
01969     ExpandArray(psl->tSequence, blockSpace, newSpace);
01970     }
01971 *blockSpacePtr = newSpace;
01972 }
01973 
01974 int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree)
01975 /* Return amount that psl overlaps (on target side) with rangeTree. */
01976 {
01977 int i;
01978 int overlap = 0;
01979 boolean isRc = (psl->strand[1] == '-');
01980 for (i=0; i<psl->blockCount; ++i)
01981     {
01982     int start = psl->tStarts[i];
01983     int end = start + psl->blockSizes[i];
01984     if (isRc)
01985         reverseIntRange(&start, &end, psl->tSize);
01986     overlap += rangeTreeOverlapSize(rangeTree, start, end);
01987     }
01988 return overlap;
01989 }
01990 

Generated on Tue Dec 25 18:39:31 2007 for blat by  doxygen 1.5.2