00001
00002
00003
00004
00005
00006
00007
00008 #include "common.h"
00009 #include "sqlNum.h"
00010 #include "sqlList.h"
00011 #include "localmem.h"
00012 #include "psl.h"
00013 #include "hash.h"
00014 #include "linefile.h"
00015 #include "dnaseq.h"
00016 #include "dystring.h"
00017 #include "fuzzyFind.h"
00018 #include "aliType.h"
00019 #include "binRange.h"
00020 #include "rangeTree.h"
00021
00022 static char const rcsid[] = "$Id: psl.c,v 1.76 2007/03/04 20:24:19 kent Exp $";
00023
00024 static char *createString =
00025 "CREATE TABLE %s (\n"
00026 "%s"
00027 "matches int unsigned not null, # Number of bases that match that aren't repeats\n"
00028 "misMatches int unsigned not null, # Number of bases that don't match\n"
00029 "repMatches int unsigned not null, # Number of bases that match but are part of repeats\n"
00030 "nCount int unsigned not null, # Number of 'N' bases\n"
00031 "qNumInsert int unsigned not null, # Number of inserts in query\n"
00032 "qBaseInsert int unsigned not null, # Number of bases inserted in query\n"
00033 "tNumInsert int unsigned not null, # Number of inserts in target\n"
00034 "tBaseInsert int unsigned not null, # Number of bases inserted in target\n"
00035 "strand char(2) not null, # + or - for strand. First character is query, second is target.\n"
00036 "qName varchar(255) not null, # Query sequence name\n"
00037 "qSize int unsigned not null, # Query sequence size\n"
00038 "qStart int unsigned not null, # Alignment start position in query\n"
00039 "qEnd int unsigned not null, # Alignment end position in query\n"
00040 "tName varchar(255) not null, # Target sequence name\n"
00041 "tSize int unsigned not null, # Target sequence size\n"
00042 "tStart int unsigned not null, # Alignment start position in target\n"
00043 "tEnd int unsigned not null, # Alignment end position in target\n"
00044 "blockCount int unsigned not null, # Number of blocks in alignment\n"
00045 "blockSizes longblob not null, # Size of each block\n"
00046 "qStarts longblob not null, # Start of each block in query.\n"
00047 "tStarts longblob not null, # Start of each block in target.\n";
00048
00049 static char *indexString =
00050 "#Indices\n"
00051 "%s"
00052 "INDEX(qName(12))\n"
00053 ")\n";
00054
00055
00056 struct psl *pslxLoad(char **row)
00057
00058
00059 {
00060 struct psl *ret = pslLoad(row);
00061 int retSize;
00062 sqlStringDynamicArray(row[21],&ret->qSequence, &retSize);
00063 sqlStringDynamicArray(row[22],&ret->tSequence, &retSize);
00064 return ret;
00065 }
00066
00067 struct psl *pslLoad(char **row)
00068
00069
00070 {
00071 struct psl *ret;
00072 int sizeOne;
00073
00074 AllocVar(ret);
00075 ret->blockCount = sqlUnsigned(row[17]);
00076 ret->match = sqlUnsigned(row[0]);
00077 ret->misMatch = sqlUnsigned(row[1]);
00078 ret->repMatch = sqlUnsigned(row[2]);
00079 ret->nCount = sqlUnsigned(row[3]);
00080 ret->qNumInsert = sqlUnsigned(row[4]);
00081 ret->qBaseInsert = sqlSigned(row[5]);
00082 ret->tNumInsert = sqlUnsigned(row[6]);
00083 ret->tBaseInsert = sqlSigned(row[7]);
00084 strcpy(ret->strand, row[8]);
00085 ret->qName = cloneString(row[9]);
00086 ret->qSize = sqlUnsigned(row[10]);
00087 ret->qStart = sqlUnsigned(row[11]);
00088 ret->qEnd = sqlUnsigned(row[12]);
00089 ret->tName = cloneString(row[13]);
00090 ret->tSize = sqlUnsigned(row[14]);
00091 ret->tStart = sqlUnsigned(row[15]);
00092 ret->tEnd = sqlUnsigned(row[16]);
00093 sqlUnsignedDynamicArray(row[18], &ret->blockSizes, &sizeOne);
00094 if (sizeOne != ret->blockCount)
00095 {
00096 printf("sizeOne bloxksizes %d bs %d block=%s\n",sizeOne, ret->blockCount,row[18]);
00097 }
00098 assert(sizeOne == ret->blockCount);
00099 sqlUnsignedDynamicArray(row[19], &ret->qStarts, &sizeOne);
00100 if (sizeOne != ret->blockCount)
00101 {
00102 printf("sizeOne qStarts %d bs %d\n",sizeOne, ret->blockCount);
00103 }
00104 assert(sizeOne == ret->blockCount);
00105 sqlUnsignedDynamicArray(row[20], &ret->tStarts, &sizeOne);
00106 if (sizeOne != ret->blockCount)
00107 {
00108 printf("sizeOne tStarts %d bs %d\n",sizeOne, ret->blockCount);
00109 }
00110 assert(sizeOne == ret->blockCount);
00111 return ret;
00112 }
00113
00114 struct psl *pslCommaIn(char **pS, struct psl *ret)
00115
00116
00117
00118 {
00119 char *s = *pS;
00120 int i;
00121
00122 if (ret == NULL)
00123 AllocVar(ret);
00124 ret->match = sqlUnsignedComma(&s);
00125 ret->misMatch = sqlUnsignedComma(&s);
00126 ret->repMatch = sqlUnsignedComma(&s);
00127 ret->nCount = sqlUnsignedComma(&s);
00128 ret->qNumInsert = sqlUnsignedComma(&s);
00129 ret->qBaseInsert = sqlSignedComma(&s);
00130 ret->tNumInsert = sqlUnsignedComma(&s);
00131 ret->tBaseInsert = sqlSignedComma(&s);
00132 sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
00133 ret->qName = sqlStringComma(&s);
00134 ret->qSize = sqlUnsignedComma(&s);
00135 ret->qStart = sqlUnsignedComma(&s);
00136 ret->qEnd = sqlUnsignedComma(&s);
00137 ret->tName = sqlStringComma(&s);
00138 ret->tSize = sqlUnsignedComma(&s);
00139 ret->tStart = sqlUnsignedComma(&s);
00140 ret->tEnd = sqlUnsignedComma(&s);
00141 ret->blockCount = sqlUnsignedComma(&s);
00142 s = sqlEatChar(s, '{');
00143 AllocArray(ret->blockSizes, ret->blockCount);
00144 for (i=0; i<ret->blockCount; ++i)
00145 {
00146 ret->blockSizes[i] = sqlUnsignedComma(&s);
00147 }
00148 s = sqlEatChar(s, '}');
00149 s = sqlEatChar(s, ',');
00150 s = sqlEatChar(s, '{');
00151 AllocArray(ret->qStarts, ret->blockCount);
00152 for (i=0; i<ret->blockCount; ++i)
00153 {
00154 ret->qStarts[i] = sqlUnsignedComma(&s);
00155 }
00156 s = sqlEatChar(s, '}');
00157 s = sqlEatChar(s, ',');
00158 s = sqlEatChar(s, '{');
00159 AllocArray(ret->tStarts, ret->blockCount);
00160 for (i=0; i<ret->blockCount; ++i)
00161 {
00162 ret->tStarts[i] = sqlUnsignedComma(&s);
00163 }
00164 s = sqlEatChar(s, '}');
00165 s = sqlEatChar(s, ',');
00166 *pS = s;
00167 return ret;
00168 }
00169
00170 void pslFree(struct psl **pEl)
00171
00172
00173 {
00174 struct psl *el;
00175
00176 if ((el = *pEl) == NULL) return;
00177 freeMem(el->qName);
00178 freeMem(el->tName);
00179 freeMem(el->blockSizes);
00180 freeMem(el->qStarts);
00181 freeMem(el->tStarts);
00182 if (el->qSequence)
00183 {
00184 freeMem(el->qSequence[0]);
00185 freeMem(el->qSequence);
00186 }
00187 if (el->tSequence)
00188 {
00189 freeMem(el->tSequence[0]);
00190 freeMem(el->tSequence);
00191 }
00192 freez(pEl);
00193 }
00194
00195 void pslFreeList(struct psl **pList)
00196
00197 {
00198 struct psl *el, *next;
00199
00200 for (el = *pList; el != NULL; el = next)
00201 {
00202 next = el->next;
00203 pslFree(&el);
00204 }
00205 *pList = NULL;
00206 }
00207
00208 void pslOutput(struct psl *el, FILE *f, char sep, char lastSep)
00209
00210 {
00211 int i;
00212 fprintf(f, "%u", el->match);
00213 fputc(sep,f);
00214 fprintf(f, "%u", el->misMatch);
00215 fputc(sep,f);
00216 fprintf(f, "%u", el->repMatch);
00217 fputc(sep,f);
00218 fprintf(f, "%u", el->nCount);
00219 fputc(sep,f);
00220 fprintf(f, "%u", el->qNumInsert);
00221 fputc(sep,f);
00222 fprintf(f, "%d", el->qBaseInsert);
00223 fputc(sep,f);
00224 fprintf(f, "%u", el->tNumInsert);
00225 fputc(sep,f);
00226 fprintf(f, "%d", el->tBaseInsert);
00227 fputc(sep,f);
00228 if (sep == ',') fputc('"',f);
00229 fprintf(f, "%s", el->strand);
00230 if (sep == ',') fputc('"',f);
00231 fputc(sep,f);
00232 if (sep == ',') fputc('"',f);
00233 fprintf(f, "%s", el->qName);
00234 if (sep == ',') fputc('"',f);
00235 fputc(sep,f);
00236 fprintf(f, "%u", el->qSize);
00237 fputc(sep,f);
00238 fprintf(f, "%u", el->qStart);
00239 fputc(sep,f);
00240 fprintf(f, "%u", el->qEnd);
00241 fputc(sep,f);
00242 if (sep == ',') fputc('"',f);
00243 fprintf(f, "%s", el->tName);
00244 if (sep == ',') fputc('"',f);
00245 fputc(sep,f);
00246 fprintf(f, "%u", el->tSize);
00247 fputc(sep,f);
00248 fprintf(f, "%u", el->tStart);
00249 fputc(sep,f);
00250 fprintf(f, "%u", el->tEnd);
00251 fputc(sep,f);
00252 fprintf(f, "%u", el->blockCount);
00253 fputc(sep,f);
00254 if (sep == ',') fputc('{',f);
00255 for (i=0; i<el->blockCount; ++i)
00256 {
00257 fprintf(f, "%u", el->blockSizes[i]);
00258 fputc(',', f);
00259 }
00260 if (sep == ',') fputc('}',f);
00261 fputc(sep,f);
00262 if (sep == ',') fputc('{',f);
00263 for (i=0; i<el->blockCount; ++i)
00264 {
00265 fprintf(f, "%u", el->qStarts[i]);
00266 fputc(',', f);
00267 }
00268 if (sep == ',') fputc('}',f);
00269 fputc(sep,f);
00270 if (sep == ',') fputc('{',f);
00271 for (i=0; i<el->blockCount; ++i)
00272 {
00273 fprintf(f, "%u", el->tStarts[i]);
00274 fputc(',', f);
00275 }
00276 if (sep == ',') fputc('}',f);
00277 if (el->qSequence)
00278 {
00279 fputc(sep,f);
00280 if (sep == ',') fputc('{',f);
00281 for (i=0; i<el->blockCount; ++i)
00282 {
00283 fprintf(f, "%s", el->qSequence[i]);
00284 fputc(',', f);
00285 }
00286 if (sep == ',') fputc('}',f);
00287 fputc(sep,f);
00288 if (sep == ',') fputc('{',f);
00289 for (i=0; i<el->blockCount; ++i)
00290 {
00291 fprintf(f, "%s", el->tSequence[i]);
00292 fputc(',', f);
00293 }
00294 if (sep == ',') fputc('}',f);
00295 }
00296
00297 fputc(lastSep,f);
00298 if (ferror(f))
00299 {
00300 perror("Error writing psl file\n");
00301 errAbort("\n");
00302 }
00303 }
00304
00305
00306
00307 void pslOutputShort(struct psl *el, FILE *f, char sep, char lastSep)
00308
00309 {
00310 fprintf(f, "%u", el->match);
00311 fputc(sep,f);
00312 fprintf(f, "%u", el->misMatch);
00313 fputc(sep,f);
00314 fprintf(f, "%u", el->repMatch);
00315 fputc(sep,f);
00316 fprintf(f, "%u", el->qNumInsert);
00317 fputc(sep,f);
00318 fprintf(f, "%d", el->qBaseInsert);
00319 fputc(sep,f);
00320 fprintf(f, "%u", el->tNumInsert);
00321 fputc(sep,f);
00322 fprintf(f, "%d", el->tBaseInsert);
00323 fputc(sep,f);
00324 if (sep == ',') fputc('"',f);
00325 fprintf(f, "%s", el->strand);
00326 if (sep == ',') fputc('"',f);
00327 fputc(sep,f);
00328 if (sep == ',') fputc('"',f);
00329 fprintf(f, "%s", el->qName);
00330 if (sep == ',') fputc('"',f);
00331 fputc(sep,f);
00332 fprintf(f, "%u", el->qStart);
00333 fputc(sep,f);
00334 fprintf(f, "%u", abs(el->qEnd - el->qStart));
00335 fputc(sep,f);
00336 if (sep == ',') fputc('"',f);
00337 fprintf(f, "%s", el->tName);
00338 if (sep == ',') fputc('"',f);
00339 fputc(sep,f);
00340 fprintf(f, "%u", el->tStart);
00341 fputc(sep,f);
00342 fprintf(f, "%u", abs(el->tEnd - el->tStart));
00343 fputc(sep,f);
00344 fprintf(f, "%u", el->blockCount);
00345 fputc(sep,f);
00346 if (sep == ',') fputc('{',f);
00347 fputc(lastSep,f);
00348 if (ferror(f))
00349 {
00350 perror("Error writing psl file\n");
00351 errAbort("\n");
00352 }
00353 }
00354
00355 void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep)
00356
00357
00358
00359
00360 {
00361 const char *headers[] = {"Matches", "Mismatches", "Matches in repeats", "Number of N bases", "Query name", "Size", "Start", "End", "Chromosome", "Strand", "Start", "End"};
00362 char *hformat = "<B>%s:</B> ";
00363 char *uformat = "<B>%s:</B> %u%c";
00364 char *targName;
00365
00366 fprintf(f, uformat, headers[0], el->match, sep);
00367 fprintf(f, uformat, headers[1], el->misMatch, sep);
00368 fprintf(f, uformat, headers[2], el->repMatch, sep);
00369 fprintf(f, uformat, headers[3], el->nCount, sep);
00370
00371 fprintf(f, hformat, headers[4]);
00372 if (sep == ',') fputc('"',f);
00373 fprintf(f, "%s", el->qName);
00374 if (sep == ',') fputc('"',f);
00375 fputc(sep,f);
00376
00377 fprintf(f, uformat, headers[5], el->qSize, sep);
00378 fprintf(f, uformat, headers[6], el->qStart, sep);
00379 fprintf(f, uformat, headers[7], el->qEnd, sep);
00380
00381 fprintf(f, hformat, headers[8]);
00382 if (sep == ',') fputc('"',f);
00383
00384 targName = el->tName;
00385 if (startsWith("chr", el->tName))
00386 targName += 3;
00387 fprintf(f, "%s", targName);
00388
00389 if (sep == ',') fputc('"',f);
00390 fputc(sep,f);
00391
00392 fprintf(f, hformat, headers[9]);
00393 if (sep == ',') fputc('"',f);
00394 fprintf(f, "%s", el->strand);
00395 if (sep == ',') fputc('"',f);
00396 fputc(sep,f);
00397
00398 fprintf(f, uformat, headers[10], el->tStart, sep);
00399 fprintf(f, uformat, headers[11], el->tEnd, sep);
00400
00401 fputc(lastSep,f);
00402
00403 if (ferror(f))
00404 {
00405 perror("Error writing psl file\n");
00406 errAbort("\n");
00407 }
00408
00409 }
00410
00411 struct psl *pslLoadAll(char *fileName)
00412
00413 {
00414 struct lineFile *lf = pslFileOpen(fileName);
00415 struct psl *pslList = NULL, *psl;
00416 while ((psl = pslNext(lf)) != NULL)
00417 {
00418 slAddHead(&pslList, psl);
00419 }
00420 slReverse(&pslList);
00421 lineFileClose(&lf);
00422 return pslList;
00423 }
00424
00425
00426 int pslCmpQuery(const void *va, const void *vb)
00427
00428 {
00429 const struct psl *a = *((struct psl **)va);
00430 const struct psl *b = *((struct psl **)vb);
00431 int dif;
00432 dif = strcmp(a->qName, b->qName);
00433 if (dif == 0)
00434 dif = a->qStart - b->qStart;
00435 return dif;
00436 }
00437
00438 int pslCmpTarget(const void *va, const void *vb)
00439
00440 {
00441 const struct psl *a = *((struct psl **)va);
00442 const struct psl *b = *((struct psl **)vb);
00443 int dif;
00444 dif = strcmp(a->tName, b->tName);
00445 if (dif == 0)
00446 dif = a->tStart - b->tStart;
00447 return dif;
00448 }
00449
00450 int pslCmpTargetAndStrand(const void *va, const void *vb)
00451
00452 {
00453 const struct psl *a = *((struct psl **)va);
00454 const struct psl *b = *((struct psl **)vb);
00455 int dif;
00456 dif = strcmp(a->tName, b->tName);
00457 if (dif == 0)
00458 dif = strcmp(a->strand, b->strand);
00459 if (dif == 0)
00460 dif = a->tStart - b->tStart;
00461 return dif;
00462 }
00463
00464
00465 int pslCmpScore(const void *va, const void *vb)
00466
00467 {
00468 const struct psl *a = *((struct psl **)va);
00469 const struct psl *b = *((struct psl **)vb);
00470 return pslScore(b) - pslScore(a);
00471 }
00472
00473 int pslCmpQueryScore(const void *va, const void *vb)
00474
00475 {
00476 const struct psl *a = *((struct psl **)va);
00477 const struct psl *b = *((struct psl **)vb);
00478 int diff = strcmp(a->qName, b->qName);
00479 if (diff == 0)
00480 diff = pslScore(b) - pslScore(a);
00481 return diff;
00482 }
00483
00484 int pslCmpMatch(const void *va, const void *vb)
00485
00486 {
00487 const struct psl *a = *((struct psl **)va);
00488 const struct psl *b = *((struct psl **)vb);
00489 return b->match - a->match;
00490 }
00491
00492 static void pslLabelColumns(FILE *f)
00493
00494 {
00495 fputs("\n"
00496 "match\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ \tQ \tQ \tQ \tT \tT \tT \tT \tblock\tblockSizes \tqStarts\t tStarts\n"
00497 " \tmatch\tmatch\t \tcount\tbases\tcount\tbases\t \tname \tsize\tstart\tend\tname \tsize\tstart\tend\tcount\n"
00498 "---------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
00499 f);
00500 }
00501
00502 void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType)
00503
00504 {
00505 fprintf(f, "psLayout version 4 %s %s\n", gfTypeName(qType), gfTypeName(tType));
00506 pslLabelColumns(f);
00507 }
00508
00509 void pslWriteHead(FILE *f)
00510
00511 {
00512 fputs("psLayout version 3\n", f);
00513 pslLabelColumns(f);
00514 }
00515
00516 void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader)
00517
00518 {
00519 FILE *f;
00520 struct psl *psl;
00521
00522 f = mustOpen(fileName, "w");
00523 if (writeHeader)
00524 pslWriteHead(f);
00525 for (psl = pslList; psl != NULL; psl = psl->next)
00526 pslTabOut(psl, f);
00527 fclose(f);
00528 }
00529
00530 void pslxFileOpen(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf)
00531
00532
00533 {
00534 char *line;
00535 int lineSize;
00536 char *words[30];
00537 char *version;
00538 int wordCount;
00539 int i;
00540 enum gfType qt = gftRna, tt = gftDna;
00541 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00542
00543 if (!lineFileNext(lf, &line, &lineSize))
00544 warn("%s is empty", fileName);
00545 else
00546 {
00547 if (startsWith("psLayout version", line))
00548 {
00549 wordCount = chopLine(line, words);
00550 if (wordCount < 3)
00551 errAbort("%s is not a psLayout file", fileName);
00552 version = words[2];
00553 if (sameString(version, "3"))
00554 {
00555 }
00556 else if (sameString(version, "4"))
00557 {
00558 qt = gfTypeFromName(words[3]);
00559 tt = gfTypeFromName(words[4]);
00560 }
00561 else
00562 {
00563 errAbort("%s is version %s of psLayout, this program can only handle through version 4",
00564 fileName, version);
00565 }
00566 for (i=0; i<4; ++i)
00567 {
00568 if (!lineFileNext(lf, &line, &lineSize))
00569 errAbort("%s severely truncated", fileName);
00570 }
00571 }
00572 else
00573 {
00574 char *s = cloneString(line);
00575 while (line != NULL && line[0] == '#')
00576 {
00577 freeMem(s);
00578 lineFileNext(lf, &line, &lineSize);
00579 s = cloneString(line);
00580 }
00581 wordCount = chopLine(s, words);
00582 if (wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-'))
00583 errAbort("%s is not a psLayout file", fileName);
00584 else
00585 lineFileReuse(lf);
00586 freeMem(s);
00587 }
00588 }
00589 *retQueryType = qt;
00590 *retTargetType = tt;
00591 *retLf = lf;
00592 }
00593
00594 static void pslxFileOpenWithMetaConfig(char *fileName, bool isMetaUnique, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00595
00596
00597 {
00598 char *line;
00599 int lineSize;
00600 char *words[30];
00601 char *version;
00602 int wordCount;
00603 int i;
00604 enum gfType qt = gftRna, tt = gftDna;
00605 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00606
00607 lineFileSetMetaDataOutput(lf, f);
00608 if (isMetaUnique)
00609 lineFileSetUniqueMetaData(lf);
00610 if (!lineFileNext(lf, &line, &lineSize))
00611 warn("%s is empty", fileName);
00612 else
00613 {
00614 if (startsWith("psLayout version", line))
00615 {
00616 wordCount = chopLine(line, words);
00617 if (wordCount < 3)
00618 errAbort("%s is not a psLayout file", fileName);
00619 version = words[2];
00620 if (sameString(version, "3"))
00621 {
00622 }
00623 else if (sameString(version, "4"))
00624 {
00625 qt = gfTypeFromName(words[3]);
00626 tt = gfTypeFromName(words[4]);
00627 }
00628 else
00629 {
00630 errAbort("%s is version %s of psLayout, this program can only handle through version 4",
00631 fileName, version);
00632 }
00633 for (i=0; i<4; ++i)
00634 {
00635 if (!lineFileNext(lf, &line, &lineSize))
00636 errAbort("%s severely truncated", fileName);
00637 }
00638 }
00639 else
00640 {
00641 char *s = cloneString(line);
00642 boolean eof = FALSE;
00643 while ((line[0] == '#') && (!eof))
00644 {
00645 freeMem(s);
00646 if (!lineFileNext(lf, &line, &lineSize))
00647 eof = TRUE;
00648 s = cloneString(line);
00649 }
00650 wordCount = chopLine(s, words);
00651 if ((wordCount < 21 || wordCount > 23 || (words[8][0] != '+' && words[8][0] != '-')) && (!eof))
00652 errAbort("%s is not a psLayout file", fileName);
00653 else
00654 lineFileReuse(lf);
00655 freeMem(s);
00656 }
00657 }
00658 *retQueryType = qt;
00659 *retTargetType = tt;
00660 *retLf = lf;
00661 }
00662
00663 void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00664
00665
00666 {
00667 pslxFileOpenWithMetaConfig(fileName, FALSE, retQueryType, retTargetType, retLf, f);
00668 }
00669
00670 void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f)
00671
00672
00673 {
00674 pslxFileOpenWithMetaConfig(fileName, TRUE, retQueryType, retTargetType, retLf, f);
00675 }
00676
00677 struct lineFile *pslFileOpen(char *fileName)
00678
00679
00680 {
00681 enum gfType qt, tt;
00682 struct lineFile *lf;
00683 pslxFileOpen(fileName, &qt, &tt, &lf);
00684 return lf;
00685 }
00686
00687 struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f)
00688
00689
00690 {
00691 enum gfType qt, tt;
00692 struct lineFile *lf;
00693 pslxFileOpenWithMeta(fileName, &qt, &tt, &lf, f);
00694 return lf;
00695 }
00696
00697 struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f)
00698
00699
00700
00701 {
00702 enum gfType qt, tt;
00703 struct lineFile *lf;
00704 pslxFileOpenWithUniqueMeta(fileName, &qt, &tt, &lf, f);
00705 return lf;
00706 }
00707
00708 struct psl *pslNext(struct lineFile *lf)
00709
00710
00711 {
00712 char *line;
00713 int lineSize;
00714 char *words[32];
00715 int wordCount;
00716 static int lineAlloc = 0;
00717 static char *chopBuf = NULL;
00718
00719 if (!lineFileNextReal(lf, &line))
00720 {
00721 return NULL;
00722 }
00723 lineSize = strlen(line);
00724 if (lineSize >= lineAlloc)
00725 {
00726 lineAlloc = lineSize+256;
00727 chopBuf = needMoreMem(chopBuf, 0, lineAlloc);
00728 }
00729 memcpy(chopBuf, line, lineSize+1);
00730 wordCount = chopLine(chopBuf, words);
00731 if (wordCount == 21)
00732 {
00733 return pslLoad(words);
00734 }
00735 if (wordCount == 23)
00736 {
00737 return pslxLoad(words);
00738 }
00739 else
00740 {
00741 errAbort("Bad line %d of %s wordCount is %d instead of 21 or 23\n", lf->lineIx, lf->fileName, wordCount);
00742 return NULL;
00743 }
00744 }
00745
00746 struct psl *pslxLoadLm(char **row, struct lm *lm)
00747
00748 {
00749 struct psl *ret = pslLoadLm(row, lm);
00750 ret->qSequence = lmAlloc(lm, sizeof(ret->qSequence[0]) * ret->blockCount);
00751 sqlStringArray(lmCloneString(lm,row[21]),ret->qSequence, ret->blockCount);
00752 ret->tSequence = lmAlloc(lm, sizeof(ret->tSequence[0]) * ret->blockCount);
00753 sqlStringArray(lmCloneString(lm,row[22]),ret->tSequence, ret->blockCount);
00754 return ret;
00755 }
00756
00757 struct psl *pslLoadLm(char **row, struct lm *lm)
00758
00759 {
00760 struct psl *ret;
00761
00762 ret = lmAlloc(lm, sizeof(*ret));
00763 ret->blockCount = sqlUnsigned(row[17]);
00764 ret->match = sqlUnsigned(row[0]);
00765 ret->misMatch = sqlUnsigned(row[1]);
00766 ret->repMatch = sqlUnsigned(row[2]);
00767 ret->nCount = sqlUnsigned(row[3]);
00768 ret->qNumInsert = sqlUnsigned(row[4]);
00769 ret->qBaseInsert = sqlSigned(row[5]);
00770 ret->tNumInsert = sqlUnsigned(row[6]);
00771 ret->tBaseInsert = sqlSigned(row[7]);
00772 strcpy(ret->strand, row[8]);
00773 ret->qName = lmCloneString(lm,row[9]);
00774 ret->qSize = sqlUnsigned(row[10]);
00775 ret->qStart = sqlUnsigned(row[11]);
00776 ret->qEnd = sqlUnsigned(row[12]);
00777 ret->tName = lmCloneString(lm, row[13]);
00778 ret->tSize = sqlUnsigned(row[14]);
00779 ret->tStart = sqlUnsigned(row[15]);
00780 ret->tEnd = sqlUnsigned(row[16]);
00781 ret->blockSizes = lmAlloc(lm, sizeof(ret->blockSizes[0]) * ret->blockCount);
00782 sqlUnsignedArray(row[18], ret->blockSizes, ret->blockCount);
00783 ret->qStarts = lmAlloc(lm, sizeof(ret->qStarts[0]) * ret->blockCount);
00784 sqlUnsignedArray(row[19], ret->qStarts, ret->blockCount);
00785 ret->tStarts = lmAlloc(lm, sizeof(ret->tStarts[0]) * ret->blockCount);
00786 sqlUnsignedArray(row[20], ret->tStarts, ret->blockCount);
00787 return ret;
00788 }
00789
00790 boolean pslIsProtein(const struct psl *psl)
00791
00792 {
00793 int lastBlock = psl->blockCount - 1;
00794
00795 return (((psl->strand[1] == '+' ) &&
00796 (psl->tEnd == psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])) ||
00797 ((psl->strand[1] == '-') &&
00798 (psl->tStart == (psl->tSize-(psl->tStarts[lastBlock] + 3*psl->blockSizes[lastBlock])))));
00799 }
00800
00801 int pslCalcMilliBad(struct psl *psl, boolean isMrna)
00802
00803 {
00804 int sizeMul = pslIsProtein(psl) ? 3 : 1;
00805 int qAliSize, tAliSize, aliSize;
00806 int milliBad = 0;
00807 int sizeDif;
00808 int insertFactor;
00809 int total;
00810
00811 qAliSize = sizeMul * (psl->qEnd - psl->qStart);
00812 tAliSize = psl->tEnd - psl->tStart;
00813 aliSize = min(qAliSize, tAliSize);
00814 if (aliSize <= 0)
00815 return 0;
00816 sizeDif = qAliSize - tAliSize;
00817 if (sizeDif < 0)
00818 {
00819 if (isMrna)
00820 sizeDif = 0;
00821 else
00822 sizeDif = -sizeDif;
00823 }
00824 insertFactor = psl->qNumInsert;
00825 if (!isMrna)
00826 insertFactor += psl->tNumInsert;
00827
00828 total = (sizeMul * (psl->match + psl->repMatch + psl->misMatch));
00829 if (total != 0)
00830 milliBad = (1000 * (psl->misMatch*sizeMul + insertFactor + round(3*log(1+sizeDif)))) / total;
00831 return milliBad;
00832 }
00833
00834 int pslScore(const struct psl *psl)
00835
00836 {
00837 int sizeMul = pslIsProtein(psl) ? 3 : 1;
00838
00839 return sizeMul * (psl->match + ( psl->repMatch>>1)) -
00840 sizeMul * psl->misMatch - psl->qNumInsert - psl->tNumInsert;
00841 }
00842
00843 int pslCmpScoreDesc(const void *va, const void *vb)
00844
00845 {
00846 const struct psl *a = *((struct psl **)va);
00847 const struct psl *b = *((struct psl **)vb);
00848 return pslScore(b) - pslScore(a);
00849 }
00850
00851
00852 struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack)
00853
00854
00855
00856 {
00857 struct ffAli *ffList = NULL, *ff;
00858 int blockCount = psl->blockCount;
00859 unsigned *blockSizes = psl->blockSizes;
00860 unsigned *qStarts = psl->qStarts;
00861 unsigned *tStarts = psl->tStarts;
00862 int size;
00863 int i;
00864
00865 for (i=0; i<blockCount; ++i)
00866 {
00867 size = blockSizes[i];
00868 AllocVar(ff);
00869 ff->left = ffList;
00870 ffList = ff;
00871 ff->nStart = ff->nEnd = needle + qStarts[i];
00872 ff->nEnd += size;
00873 ff->hStart = ff->hEnd = haystack + tStarts[i];
00874 ff->hEnd += size;
00875 }
00876 ffList = ffMakeRightLinks(ffList);
00877 return ffList;
00878 }
00879
00880 struct psl *pslFromFakeFfAli(struct ffAli *ff,
00881 DNA *needle, DNA *haystack, char strand,
00882 char *qName, int qSize, char *tName, int tSize)
00883
00884
00885
00886 {
00887 struct psl *psl;
00888 unsigned *blockSizes;
00889 unsigned *qStarts;
00890 unsigned *tStarts;
00891 int blockCount;
00892 int i;
00893 int nStart, hStart;
00894 int nEnd, hEnd;
00895
00896 AllocVar(psl);
00897 psl->blockCount = blockCount = ffAliCount(ff);
00898 psl->blockSizes = AllocArray(blockSizes, blockCount);
00899 psl->qStarts = AllocArray(qStarts, blockCount);
00900 psl->tStarts = AllocArray(tStarts, blockCount);
00901 psl->qName = cloneString(qName);
00902 psl->qSize = qSize;
00903 psl->tName = cloneString(tName);
00904 psl->tSize = tSize;
00905 psl->strand[0] = strand;
00906
00907 for (i=0; i<blockCount; ++i)
00908 {
00909 nStart = ff->nStart - needle;
00910 nEnd = ff->nEnd - needle;
00911 hStart = ff->hStart - haystack;
00912 hEnd = ff->hEnd - haystack;
00913 blockSizes[i] = nEnd - nStart;
00914 qStarts[i] = nStart;
00915 tStarts[i] = hStart;
00916 if (i == 0)
00917 {
00918 psl->qStart = nStart;
00919 psl->tStart = hStart;
00920 }
00921 if (i == blockCount-1)
00922 {
00923 psl->qEnd = nEnd;
00924 psl->tEnd = hEnd;
00925 }
00926 ff = ff->right;
00927 }
00928 if (strand == '-')
00929 {
00930 reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
00931 }
00932 return psl;
00933 }
00934
00935 struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
00936 int targetOffset)
00937
00938
00939 {
00940 struct ffAli *ffList = NULL, *ff;
00941 DNA *needle = query->dna;
00942 DNA *haystack = target->dna;
00943 int blockCount = psl->blockCount;
00944 unsigned *blockSizes = psl->blockSizes;
00945 unsigned *qStarts = psl->qStarts;
00946 unsigned *tStarts = psl->tStarts;
00947 int size;
00948 int i;
00949 int tMin = targetOffset;
00950 int tMax = targetOffset + target->size;
00951 int tStart, tEnd;
00952 int clipStart, clipEnd, clipOffset, clipSize;
00953
00954 for (i=0; i<blockCount; ++i)
00955 {
00956 clipStart = tStart = tStarts[i];
00957 size = blockSizes[i];
00958 clipEnd = tEnd = tStart + size;
00959 if (tStart < tMax && tEnd > tMin)
00960 {
00961 if (clipStart < tMin) clipStart = tMin;
00962 if (clipEnd > tMax) clipEnd = tMax;
00963 clipOffset = clipStart - tStart;
00964 clipSize = clipEnd - clipStart;
00965 AllocVar(ff);
00966 ff->left = ffList;
00967 ffList = ff;
00968 ff->nStart = ff->nEnd = needle + qStarts[i] + clipOffset;
00969 ff->nEnd += clipSize;
00970 ff->hStart = ff->hEnd = haystack + clipStart - targetOffset;
00971 ff->hEnd += clipSize;
00972 }
00973 }
00974 ffList = ffMakeRightLinks(ffList);
00975 ffCountGoodEnds(ffList);
00976 return ffList;
00977 }
00978
00979 int pslOrientation(struct psl *psl)
00980
00981 {
00982 if (psl->strand[1] != '\0')
00983 {
00984
00985 if (psl->strand[0] != psl->strand[1])
00986 return -1;
00987 else
00988 return 1;
00989 }
00990 else
00991 {
00992 if (psl->strand[0] == '-')
00993 return -1;
00994 else
00995 return 1;
00996 }
00997 }
00998
00999 int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
01000
01001
01002
01003
01004
01005 {
01006 int intronDir = 0;
01007 int oneDir;
01008 int i;
01009 DNA *dna = genoSeq->dna;
01010
01011
01012 if (psl->strand[1] == '-')
01013 errAbort("pslWeightedIntronOrientation doesn't support a negative target strand");
01014
01015 for (i=1; i<psl->blockCount; ++i)
01016 {
01017 int iStart, iEnd, blockSize = psl->blockSizes[i-1];
01018 if (psl->qStarts[i-1] + blockSize == psl->qStarts[i])
01019 {
01020 iStart = psl->tStarts[i-1] + psl->blockSizes[i-1] - offset;
01021 iEnd = psl->tStarts[i] - offset;
01022 oneDir = intronOrientation(dna+iStart, dna+iEnd);
01023 intronDir += oneDir;
01024 }
01025 }
01026 return intronDir;
01027 }
01028
01029 int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset)
01030
01031
01032
01033
01034 {
01035 int intronDir = pslWeightedIntronOrientation(psl, genoSeq, offset);
01036 if (intronDir < 0)
01037 intronDir = -1;
01038 else if (intronDir > 0)
01039 intronDir = 1;
01040 return intronDir;
01041 }
01042
01043 boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset)
01044
01045
01046 {
01047 int blockCount = psl->blockCount, i;
01048 unsigned *tStarts = psl->tStarts;
01049 unsigned *blockSizes = psl->blockSizes;
01050 unsigned *qStarts = psl->qStarts;
01051 int blockSize, start, end;
01052 DNA *dna = seq->dna;
01053
01054 for (i=1; i<blockCount; ++i)
01055 {
01056 blockSize = blockSizes[i-1];
01057 start = qStarts[i-1]+blockSize;
01058 end = qStarts[i];
01059 if (start == end)
01060 {
01061 start = tStarts[i-1] + blockSize;
01062 end = tStarts[i];
01063 if (psl->strand[1] == '-')
01064 reverseIntRange(&start, &end, psl->tSize);
01065 start -= seqOffset;
01066 end -= seqOffset;
01067 if (intronOrientation(dna+start, dna+end) != 0)
01068 return TRUE;
01069 }
01070 }
01071 return FALSE;
01072 }
01073
01074 void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail)
01075
01076 {
01077 int orientation = pslOrientation(psl);
01078 int qFloppyStart, qFloppyEnd;
01079 int tFloppyStart, tFloppyEnd;
01080
01081 if (orientation > 0)
01082 {
01083 qFloppyStart = psl->qStart;
01084 qFloppyEnd = psl->qSize - psl->qEnd;
01085 }
01086 else
01087 {
01088 qFloppyStart = psl->qSize - psl->qEnd;
01089 qFloppyEnd = psl->qStart;
01090 }
01091 tFloppyStart = psl->tStart;
01092 tFloppyEnd = psl->tSize - psl->tEnd;
01093 *retStartTail = min(qFloppyStart, tFloppyStart);
01094 *retEndTail = min(qFloppyEnd, tFloppyEnd);
01095 }
01096
01097 static void rcSeqs(char **seqs, unsigned blockCount, unsigned *blockSizes)
01098
01099
01100 {
01101 char *buf, *next;
01102 int i, memSz = 0;
01103
01104
01105 for (i = 0; i < blockCount; i++)
01106 memSz += blockSizes[i]+1;
01107 next = buf = needLargeMem(memSz);
01108
01109
01110 for (i = blockCount-1; i >= 0; i--)
01111 {
01112 int len = strlen(seqs[i]);
01113 reverseComplement(seqs[i], len);
01114 memcpy(next, seqs[i], len+1);
01115 next += len+1;
01116 }
01117
01118
01119 freeMem(seqs[0]);
01120 seqs[0] = buf;
01121 next = buf;
01122
01123 for (i = 0; i < blockCount; i++)
01124 {
01125 seqs[i] = next;
01126 next += blockSizes[i]+1;
01127 }
01128 }
01129
01130 void pslRcBoth(struct psl *psl)
01131
01132
01133 {
01134 int tSize = psl->tSize, qSize = psl->qSize;
01135 int blockCount = psl->blockCount, i;
01136 unsigned *tStarts = psl->tStarts, *qStarts = psl->qStarts, *blockSizes = psl->blockSizes;
01137
01138 reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
01139 for (i=0; i<blockCount; ++i)
01140 {
01141 tStarts[i] = (int)tSize - ((int)tStarts[i] + (int)blockSizes[i]);
01142 qStarts[i] = (int)qSize - ((int)qStarts[i] + (int)blockSizes[i]);
01143 }
01144 reverseUnsigned(tStarts, blockCount);
01145 reverseUnsigned(qStarts, blockCount);
01146 reverseUnsigned(blockSizes, blockCount);
01147 }
01148
01149 void pslRc(struct psl *psl)
01150
01151 {
01152 int i;
01153
01154
01155 psl->strand[0] = (psl->strand[0] != '-') ? '-' : '+';
01156 psl->strand[1] = (psl->strand[1] != '-') ? '-' : '+';
01157
01158 for (i = 0; i < psl->blockCount; i++)
01159 {
01160 psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
01161 psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
01162 }
01163 reverseUnsigned(psl->tStarts, psl->blockCount);
01164 reverseUnsigned(psl->qStarts, psl->blockCount);
01165 reverseUnsigned(psl->blockSizes, psl->blockCount);
01166 if (psl->qSequence != NULL)
01167 {
01168 rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes);
01169 rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes);
01170 }
01171 }
01172
01173
01174 #define swapVars(a, b, tmp) ((tmp) = (a), (a) = (b), (b) = (tmp))
01175
01176 static void swapBlocks(struct psl *psl)
01177
01178 {
01179 int i;
01180 unsigned utmp;
01181 char *stmp;
01182 for (i = 0; i < psl->blockCount; i++)
01183 {
01184 swapVars(psl->qStarts[i], psl->tStarts[i], utmp);
01185 if (psl->qSequence != NULL)
01186 swapVars(psl->qSequence[i], psl->tSequence[i], stmp);
01187 }
01188 }
01189
01190 static void swapRCBlocks(struct psl *psl)
01191
01192
01193 {
01194 int i;
01195 unsigned *uatmp;
01196 char **satmp;
01197 reverseUnsigned(psl->tStarts, psl->blockCount);
01198 reverseUnsigned(psl->qStarts, psl->blockCount);
01199 reverseUnsigned(psl->blockSizes, psl->blockCount);
01200 swapVars(psl->tStarts, psl->qStarts, uatmp);
01201
01202
01203 for (i = 0; i < psl->blockCount; i++)
01204 {
01205 psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
01206 psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
01207 }
01208 if (psl->qSequence != NULL)
01209 {
01210
01211
01212 rcSeqs(psl->qSequence, psl->blockCount, psl->blockSizes);
01213 rcSeqs(psl->tSequence, psl->blockCount, psl->blockSizes);
01214 swapVars(psl->qSequence, psl->tSequence, satmp);
01215 }
01216 }
01217
01218 void pslSwap(struct psl *psl, boolean noRc)
01219
01220
01221 {
01222 int itmp;
01223 unsigned utmp;
01224 char ctmp, *stmp;
01225 swapVars(psl->qBaseInsert, psl->tBaseInsert, utmp);
01226 swapVars(psl->tNumInsert, psl->qNumInsert, utmp);
01227 swapVars(psl->qName, psl->tName, stmp);
01228 swapVars(psl->qSize, psl->tSize, utmp);
01229 swapVars(psl->qStart, psl->tStart, itmp);
01230 swapVars(psl->qEnd, psl->tEnd, itmp);
01231
01232
01233 if (psl->strand[1] != '\0')
01234 {
01235
01236 swapVars(psl->strand[0], psl->strand[1], ctmp);
01237 swapBlocks(psl);
01238 }
01239 else if (noRc)
01240 {
01241
01242 psl->strand[1] = psl->strand[0];
01243 psl->strand[0] = '+';
01244 swapBlocks(psl);
01245 }
01246 else
01247 {
01248
01249 if (psl->strand[0] == '+')
01250 swapBlocks(psl);
01251 else
01252 swapRCBlocks(psl);
01253 }
01254 }
01255
01256 void pslTargetOffset(struct psl *psl, int offset)
01257
01258 {
01259 int i, blockCount = psl->blockCount;
01260 unsigned *tStarts = psl->tStarts;
01261 psl->tStart += offset;
01262 psl->tEnd += offset;
01263 for (i=0; i<blockCount; ++i)
01264 tStarts[i] += offset;
01265 }
01266
01267 void pslDump(struct psl *psl, FILE *f)
01268
01269 {
01270 int i;
01271 fprintf(f, "<PRE>\n");
01272 fprintf(f, "psl %s:%d-%d %s %s:%d-%d %d\n",
01273 psl->qName, psl->qStart, psl->qEnd, psl->strand,
01274 psl->tName, psl->tStart, psl->tEnd, psl->blockCount);
01275 for (i=0; i<psl->blockCount; ++i)
01276 fprintf(f, " size %d, qStart %d, tStart %d\n",
01277 psl->blockSizes[i], psl->qStarts[i], psl->tStarts[i]);
01278 fprintf(f, "</PRE>");
01279 }
01280
01281 static void pslRecalcBounds(struct psl *psl)
01282
01283
01284 {
01285 int qStart, qEnd, tStart, tEnd, size;
01286 int last = psl->blockCount - 1;
01287 qStart = psl->qStarts[0];
01288 tStart = psl->tStarts[0];
01289 size = psl->blockSizes[last];
01290 qEnd = psl->qStarts[last] + size;
01291 tEnd = psl->tStarts[last] + size;
01292 if (psl->strand[0] == '-')
01293 reverseIntRange(&qStart, &qEnd, psl->qSize);
01294 if (psl->strand[1] == '-')
01295 reverseIntRange(&tStart, &tEnd, psl->tSize);
01296 psl->qStart = qStart;
01297 psl->qEnd = qEnd;
01298 psl->tStart = tStart;
01299 psl->tEnd = tEnd;
01300 }
01301
01302 struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax)
01303
01304
01305 {
01306 int newSize;
01307 int oldBlockCount = oldPsl->blockCount;
01308 boolean tIsRc = (oldPsl->strand[1] == '-');
01309 int newBlockCount = 0, completeBlockCount = 0;
01310 int i;
01311 struct psl *newPsl = NULL;
01312 int tMn = tMin, tMx = tMax;
01313
01314
01315 newSize = rangeIntersection(oldPsl->tStart, oldPsl->tEnd, tMin, tMax);
01316 if (newSize <= 0)
01317 return NULL;
01318
01319 if (tIsRc)
01320 reverseIntRange(&tMn, &tMx, oldPsl->tSize);
01321
01322
01323 oldBlockCount = oldPsl->blockCount;
01324 for (i=0; i<oldBlockCount; ++i)
01325 {
01326 int s = oldPsl->tStarts[i];
01327 int e = s + oldPsl->blockSizes[i];
01328 int sz = e - s;
01329 int overlap;
01330 if ((overlap = rangeIntersection(s, e, tMn, tMx)) > 0)
01331 ++newBlockCount;
01332 if (overlap == sz)
01333 ++completeBlockCount;
01334 }
01335
01336 if (newBlockCount == 0)
01337 return NULL;
01338
01339
01340 AllocVar(newPsl);
01341 strcpy(newPsl->strand, oldPsl->strand);
01342 newPsl->qName = cloneString(oldPsl->qName);
01343 newPsl->qSize = oldPsl->qSize;
01344 newPsl->tName = cloneString(oldPsl->tName);
01345 newPsl->tSize = oldPsl->tSize;
01346 newPsl->blockCount = newBlockCount;
01347 AllocArray(newPsl->blockSizes, newBlockCount);
01348 AllocArray(newPsl->qStarts, newBlockCount);
01349 AllocArray(newPsl->tStarts, newBlockCount);
01350
01351
01352 newBlockCount = completeBlockCount = 0;
01353 for (i=0; i<oldBlockCount; ++i)
01354 {
01355 int oldSz = oldPsl->blockSizes[i];
01356 int sz = oldSz;
01357 int tS = oldPsl->tStarts[i];
01358 int tE = tS + sz;
01359 int qS = oldPsl->qStarts[i];
01360 int qE = qS + sz;
01361 if (rangeIntersection(tS, tE, tMn, tMx) > 0)
01362 {
01363 int diff;
01364 if ((diff = (tMn - tS)) > 0)
01365 {
01366 tS += diff;
01367 qS += diff;
01368 sz -= diff;
01369 }
01370 if ((diff = (tE - tMx)) > 0)
01371 {
01372 tE -= diff;
01373 qE -= diff;
01374 sz -= diff;
01375 }
01376 newPsl->qStarts[newBlockCount] = qS;
01377 newPsl->tStarts[newBlockCount] = tS;
01378 newPsl->blockSizes[newBlockCount] = sz;
01379 ++newBlockCount;
01380 if (sz == oldSz)
01381 ++completeBlockCount;
01382 }
01383 }
01384 pslRecalcBounds(newPsl);
01385 return newPsl;
01386 }
01387
01388 struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax)
01389
01390
01391 {
01392 int newSize;
01393 int oldBlockCount = oldPsl->blockCount;
01394 boolean qIsRc = (oldPsl->strand[0] == '-');
01395 int newBlockCount = 0, completeBlockCount = 0;
01396 int i;
01397 struct psl *newPsl = NULL;
01398 int qMn = qMin, qMx = qMax;
01399
01400
01401 newSize = rangeIntersection(oldPsl->qStart, oldPsl->qEnd, qMin, qMax);
01402 if (newSize <= 0)
01403 return NULL;
01404
01405 if (qIsRc)
01406 reverseIntRange(&qMn, &qMx, oldPsl->qSize);
01407
01408
01409 oldBlockCount = oldPsl->blockCount;
01410 for (i=0; i<oldBlockCount; ++i)
01411 {
01412 int s = oldPsl->qStarts[i];
01413 int e = s + oldPsl->blockSizes[i];
01414 int sz = e - s;
01415 int overlap;
01416 if ((overlap = rangeIntersection(s, e, qMn, qMx)) > 0)
01417 ++newBlockCount;
01418 if (overlap == sz)
01419 ++completeBlockCount;
01420 }
01421
01422 if (newBlockCount == 0)
01423 return NULL;
01424
01425
01426 AllocVar(newPsl);
01427 strcpy(newPsl->strand, oldPsl->strand);
01428 newPsl->qName = cloneString(oldPsl->qName);
01429 newPsl->qSize = oldPsl->qSize;
01430 newPsl->tName = cloneString(oldPsl->tName);
01431 newPsl->tSize = oldPsl->tSize;
01432 newPsl->blockCount = newBlockCount;
01433 AllocArray(newPsl->blockSizes, newBlockCount);
01434 AllocArray(newPsl->qStarts, newBlockCount);
01435 AllocArray(newPsl->tStarts, newBlockCount);
01436
01437
01438 newBlockCount = completeBlockCount = 0;
01439 for (i=0; i<oldBlockCount; ++i)
01440 {
01441 int oldSz = oldPsl->blockSizes[i];
01442 int sz = oldSz;
01443 int qS = oldPsl->qStarts[i];
01444 int qE = qS + sz;
01445 int tS = oldPsl->tStarts[i];
01446 int tE = tS + sz;
01447 if (rangeIntersection(qS, qE, qMn, qMx) > 0)
01448 {
01449 int diff;
01450 if ((diff = (qMn - qS)) > 0)
01451 {
01452 tS += diff;
01453 qS += diff;
01454 sz -= diff;
01455 }
01456 if ((diff = (qE - qMx)) > 0)
01457 {
01458 tE -= diff;
01459 qE -= diff;
01460 sz -= diff;
01461 }
01462 newPsl->qStarts[newBlockCount] = qS;
01463 newPsl->tStarts[newBlockCount] = tS;
01464 newPsl->blockSizes[newBlockCount] = sz;
01465 ++newBlockCount;
01466 if (sz == oldSz)
01467 ++completeBlockCount;
01468 }
01469 }
01470 pslRecalcBounds(newPsl);
01471 return newPsl;
01472 }
01473 char* pslGetCreateSql(char* table, unsigned options, int tNameIdxLen)
01474
01475
01476
01477
01478
01479 {
01480 struct dyString *sqlCmd = newDyString(2048);
01481 char *sqlCmdStr;
01482 char binIx[32];
01483
01484 binIx[0] = '\0';
01485
01486
01487 if ((tNameIdxLen > 0) && !(options & PSL_TNAMEIX))
01488 errAbort("pslGetCreateSql: must specify PSL_TNAMEIX with tNameIdxLen > 0");
01489 if ((options & PSL_TNAMEIX) && (tNameIdxLen == 0))
01490 tNameIdxLen = 8;
01491
01492
01493 if (options & PSL_WITH_BIN)
01494 {
01495 if (options & PSL_TNAMEIX)
01496 safef(binIx, sizeof(binIx), "INDEX(tName(%d),bin),\n", tNameIdxLen);
01497 else
01498 safef(binIx, sizeof(binIx), "INDEX(bin),\n");
01499 }
01500 dyStringPrintf(sqlCmd, createString, table,
01501 ((options & PSL_WITH_BIN) ? "bin smallint unsigned not null,\n" : ""));
01502 if (options & PSL_XA_FORMAT)
01503 {
01504 dyStringPrintf(sqlCmd, "qSeq longblob not null,\n");
01505 dyStringPrintf(sqlCmd, "tSeq longblob not null,\n");
01506 }
01507 dyStringPrintf(sqlCmd, indexString, binIx);
01508 sqlCmdStr = cloneString(sqlCmd->string);
01509 dyStringFree(&sqlCmd);
01510 return sqlCmdStr;
01511 }
01512
01513 static void printPslDesc(char* pslDesc, FILE* out, struct psl* psl)
01514
01515 {
01516 fprintf(out, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n",
01517 psl->qName, psl->qStart, psl->qEnd,
01518 psl->tName, psl->tStart, psl->tEnd,
01519 psl->strand, pslDesc);
01520 }
01521
01522 static void chkRanges(char* pslDesc, FILE* out, struct psl* psl,
01523 char* pName, char* pLabel, char pCLabel, char pStrand,
01524 unsigned pSize, unsigned pStart, unsigned pEnd,
01525 unsigned blockCount, unsigned* blockSizes,
01526 unsigned* pBlockStarts, int* errCountPtr)
01527
01528 {
01529 int errCount = *errCountPtr;
01530 unsigned iBlk, prevBlkEnd = 0;
01531
01532 if (pStart >= pEnd)
01533 {
01534 if (errCount == 0)
01535 printPslDesc(pslDesc, out, psl);
01536 fprintf(out, "\t%s %cStart %u >= %cEnd %u\n",
01537 pName, pCLabel, pStart, pCLabel, pEnd);
01538 errCount++;
01539 }
01540 if (pEnd > pSize)
01541 {
01542 if (errCount == 0)
01543 printPslDesc(pslDesc, out, psl);
01544 fprintf(out, "\t%s %cEnd %u >= %cSize %u\n",
01545 pName, pCLabel, pEnd, pCLabel, pSize);
01546 errCount++;
01547 }
01548 for (iBlk = 0; iBlk < blockCount; iBlk++)
01549 {
01550 unsigned blkStart = pBlockStarts[iBlk];
01551 unsigned blkEnd = blkStart+blockSizes[iBlk];
01552
01553 unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd);
01554 unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart);
01555
01556 if ((pSize > 0) && (blkEnd > pSize))
01557 {
01558 if (errCount == 0)
01559 printPslDesc(pslDesc, out, psl);
01560 fprintf(out, "\t%s %s block %u end %u > %cSize %u\n",
01561 pName, pLabel, iBlk, blkEnd, pCLabel, pSize);
01562 errCount++;
01563 }
01564 if (gBlkStart < pStart)
01565 {
01566 if (errCount == 0)
01567 printPslDesc(pslDesc, out, psl);
01568 fprintf(out, "\t%s %s block %u start %u < %cStart %u\n",
01569 pName, pLabel, iBlk, gBlkStart, pCLabel, pStart);
01570 errCount++;
01571 }
01572 if (gBlkStart >= pEnd)
01573 {
01574 if (errCount == 0)
01575 printPslDesc(pslDesc, out, psl);
01576 fprintf(out, "\t%s %s block %u start %u >= %cEnd %u\n",
01577 pName, pLabel, iBlk, gBlkStart, pCLabel, pEnd);
01578 errCount++;
01579 }
01580 if (gBlkEnd < pStart)
01581 {
01582 if (errCount == 0)
01583 printPslDesc(pslDesc, out, psl);
01584 fprintf(out, "\t%s %s block %u end %u < %cStart %u\n",
01585 pName, pLabel, iBlk, gBlkEnd, pCLabel, pStart);
01586 errCount++;
01587 }
01588 if (gBlkEnd > pEnd)
01589 {
01590 if (errCount == 0)
01591 printPslDesc(pslDesc, out, psl);
01592 fprintf(out, "\t%s %s block %u end %u > %cEnd %u\n",
01593 pName, pLabel, iBlk, gBlkEnd, pCLabel, pEnd);
01594 errCount++;
01595 }
01596 if ((iBlk > 0) && (blkStart < prevBlkEnd))
01597 {
01598 if (errCount == 0)
01599 printPslDesc(pslDesc, out, psl);
01600 fprintf(out, "\t%s %s block %u start %u < previous block end %u\n",
01601 pName, pLabel, iBlk, blkStart, prevBlkEnd);
01602 errCount++;
01603 }
01604 prevBlkEnd = blkEnd;
01605 }
01606 *errCountPtr = errCount;
01607 }
01608
01609 int pslCheck(char *pslDesc, FILE* out, struct psl* psl)
01610
01611
01612 {
01613 static char* VALID_STRANDS[] = {
01614 "+", "-", "++", "+-", "-+", "--", NULL
01615 };
01616 int i, errCount = 0;
01617 char strand;
01618 boolean isProt = FALSE;
01619
01620
01621 for (i = 0; VALID_STRANDS[i] != NULL; i++)
01622 {
01623 if (strcmp(psl->strand, VALID_STRANDS[i]) == 0)
01624 break;
01625 }
01626 if (VALID_STRANDS[i] == NULL)
01627 {
01628 if (errCount == 0)
01629 printPslDesc(pslDesc, out, psl);
01630 fprintf(out, "\tinvalid PSL strand: \"%s\"\n", psl->strand);
01631 errCount++;
01632 }
01633
01634
01635 if (pslIsProtein(psl))
01636 {
01637 isProt = TRUE;
01638 for (i = 0; i < psl->blockCount ; i++)
01639 psl->blockSizes[i] *= 3;
01640 }
01641
01642 strand = ((psl->strand[1] == '\0') ? '+' : psl->strand[1]);
01643 chkRanges(pslDesc, out, psl, psl->tName, "target", 't',
01644 strand, psl->tSize, psl->tStart, psl->tEnd,
01645 psl->blockCount, psl->blockSizes, psl->tStarts,
01646 &errCount);
01647 if (isProt)
01648 {
01649 for (i = 0; i < psl->blockCount ; i++)
01650 psl->blockSizes[i] /= 3;
01651 }
01652
01653
01654 strand = psl->strand[0];
01655 chkRanges(pslDesc, out, psl, psl->qName, "query", 'q',
01656 strand, psl->qSize, psl->qStart, psl->qEnd,
01657 psl->blockCount, psl->blockSizes, psl->qStarts,
01658 &errCount);
01659
01660 return errCount;
01661 }
01662
01663 struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName)
01664
01665 {
01666 struct binKeeper *bk;
01667 struct psl *psl;
01668 struct lineFile *sf = lineFileOpen(sizeFileName, TRUE);
01669 struct lineFile *pf = lineFileOpen(pslFileName , TRUE);
01670 struct hash *hash = newHash(0);
01671 char *chromRow[2];
01672 char *row[21] ;
01673
01674 while (lineFileRow(sf, chromRow))
01675 {
01676 char *name = chromRow[0];
01677 int size = lineFileNeedNum(sf, chromRow, 1);
01678
01679 if (hashLookup(hash, name) != NULL)
01680 warn("Duplicate %s, ignoring all but first\n", name);
01681 else
01682 {
01683 bk = binKeeperNew(0, size);
01684 assert(size > 1);
01685 hashAdd(hash, name, bk);
01686 }
01687 }
01688 while (lineFileNextRow(pf, row, ArraySize(row)))
01689 {
01690 psl = pslLoad(row);
01691 bk = hashMustFindVal(hash, psl->tName);
01692 binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
01693 }
01694 lineFileClose(&pf);
01695 lineFileClose(&sf);
01696 return hash;
01697 }
01698
01699 static int countInitialChars(char *s, char c)
01700
01701 {
01702 int count = 0;
01703 char d;
01704 while ((d = *s++) != 0)
01705 {
01706 if (c == d)
01707 ++count;
01708 else
01709 break;
01710 }
01711 return count;
01712 }
01713
01714 static int countTerminalChars(char *s, char c)
01715
01716 {
01717 int len = strlen(s), i;
01718 int count = 0;
01719 for (i=len-1; i>=0; --i)
01720 {
01721 if (c == s[i])
01722 ++count;
01723 else
01724 break;
01725 }
01726 return count;
01727 }
01728
01729 static int countNonInsert(char *s, int size)
01730
01731
01732 {
01733 int count = 0;
01734 int i;
01735 for (i=0; i<size; ++i)
01736 if (*s++ != '-')
01737 ++count;
01738 return count;
01739 }
01740
01741 static void trimAlignment(struct psl* psl, char** qStringPtr, char** tStringPtr,
01742 int* aliSizePtr)
01743
01744 {
01745 char* qString = *qStringPtr;
01746 char* tString = *tStringPtr;
01747 int aliSize = *aliSizePtr;
01748 int qStartInsert = countInitialChars(qString, '-');
01749 int tStartInsert = countInitialChars(tString, '-');
01750 int qEndInsert = countTerminalChars(qString, '-');
01751 int tEndInsert = countTerminalChars(tString, '-');
01752 int startInsert = max(qStartInsert, tStartInsert);
01753 int endInsert = max(qEndInsert, tEndInsert);
01754 int qNonCount, tNonCount;
01755
01756 if (startInsert > 0)
01757 {
01758 qNonCount = countNonInsert(qString, startInsert);
01759 tNonCount = countNonInsert(tString, startInsert);
01760 qString += startInsert;
01761 tString += startInsert;
01762 aliSize -= startInsert;
01763 psl->qStart += qNonCount;
01764 psl->tStart += tNonCount;
01765 }
01766 if (endInsert > 0)
01767 {
01768 aliSize -= endInsert;
01769 qNonCount = countNonInsert(qString+aliSize, endInsert);
01770 tNonCount = countNonInsert(tString+aliSize, endInsert);
01771 qString[aliSize] = 0;
01772 tString[aliSize] = 0;
01773 psl->qEnd -= qNonCount;
01774 psl->tEnd -= tNonCount;
01775 }
01776 *qStringPtr = qString;
01777 *tStringPtr = tString;
01778 *aliSizePtr = aliSize;
01779
01780 if (startInsert > 0 || endInsert > 0)
01781 trimAlignment(psl, qStringPtr, tStringPtr, aliSizePtr);
01782 }
01783
01784 static void addBlock(struct psl* psl, int qs, int qe, int ts, int te,
01785 int *blockSpace)
01786
01787 {
01788 assert((qe-qs) == (te-ts));
01789 if (psl->blockCount == *blockSpace)
01790 pslGrow(psl, blockSpace);
01791 psl->blockSizes[psl->blockCount] = qe - qs;
01792 psl->qStarts[psl->blockCount] = qs;
01793 psl->tStarts[psl->blockCount] = ts;
01794 psl->blockCount++;
01795 }
01796
01797 static void accumCounts(struct psl *psl, char prevQ, char prevT,
01798 char q, char t, unsigned options)
01799
01800 {
01801 if ((q != '-') && (t != '-'))
01802 {
01803
01804 char qu = toupper(q);
01805 char tu = toupper(t);
01806 if ((q == 'N') || (t == 'N'))
01807 psl->nCount++;
01808 else if (qu == tu)
01809 {
01810 if ((options & PSL_IS_SOFTMASK) && ((qu != q) || (tu != t)))
01811 psl->repMatch++;
01812 else
01813 psl->match++;
01814 }
01815 else
01816 psl->misMatch++;
01817 }
01818 else if ((q == '-') && (t != '-'))
01819 {
01820
01821 psl->tBaseInsert++;
01822 if (prevQ != '-')
01823 psl->tNumInsert++;
01824 }
01825 else if ((t == '-') && (q != '-'))
01826 {
01827
01828 psl->qBaseInsert++;
01829 if (prevT != '-')
01830 psl->qNumInsert++;
01831 }
01832 }
01833
01834 struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
01835 char *tName, int tSize, int tStart, int tEnd, char *tString,
01836 char* strand, unsigned options)
01837
01838
01839
01840 {
01841
01842
01843
01844
01845
01846
01847
01848 int blockSpace = 16;
01849 struct psl* psl = NULL;
01850 int aliSize = strlen(qString);
01851 boolean eitherInsert = FALSE;
01852 int i, qs,qe,ts,te;
01853 char prevQ = '\0', prevT = '\0';
01854 AllocVar(psl);
01855
01856 if (strlen(tString) != aliSize)
01857 errAbort("query and target alignment strings are different lengths");
01858
01859 psl = pslNew(qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd,
01860 strand, blockSpace, 0);
01861 trimAlignment(psl, &qString, &tString, &aliSize);
01862
01863
01864 if ((psl->qStart == psl->qEnd) || (psl->tStart == psl->tEnd))
01865 {
01866 pslFree(&psl);
01867 return NULL;
01868 }
01869
01870
01871 qs = psl->qStart;
01872 qe = psl->qEnd;
01873 if (strand[0] == '-')
01874 reverseIntRange(&qs, &qe, psl->qSize);
01875 ts = psl->tStart;
01876 te = psl->tEnd;
01877 if (strand[1] == '-')
01878 reverseIntRange(&ts, &te, psl->tSize);
01879
01880 eitherInsert = FALSE;
01881 qe = qs;
01882 te = ts;
01883 for (i=0; i<aliSize; ++i)
01884 {
01885 char q = qString[i];
01886 char t = tString[i];
01887 if ((q == '-') && (t == '-'))
01888 {
01889 continue;
01890 }
01891 else if ((q == '-') || (t == '-'))
01892 {
01893
01894 if (!eitherInsert)
01895 {
01896
01897 addBlock(psl, qs, qe, ts, te, &blockSpace);
01898 eitherInsert = TRUE;
01899 }
01900 if (q != '-')
01901 qe += 1;
01902 if (t != '-')
01903 te += 1;
01904 }
01905 else
01906 {
01907
01908 if (eitherInsert)
01909 {
01910
01911 qs = qe;
01912 ts = te;
01913 eitherInsert = FALSE;
01914 }
01915 qe += 1;
01916 te += 1;
01917 }
01918 accumCounts(psl, prevQ, prevT, q, t, options);
01919 prevQ = q;
01920 prevT = t;
01921 }
01922 addBlock(psl, qs, qe, ts, te, &blockSpace);
01923 return psl;
01924 }
01925
01926 struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
01927 char *tName, unsigned tSize, int tStart, int tEnd,
01928 char *strand, unsigned blockSpace, unsigned opts)
01929
01930
01931
01932 {
01933 struct psl *psl;
01934 AllocVar(psl);
01935 assert(blockSpace > 0);
01936 psl->qName = cloneString(qName);
01937 psl->qSize = qSize;
01938 psl->qStart = qStart;
01939 psl->qEnd = qEnd;
01940 psl->tName = cloneString(tName);
01941 psl->tSize = tSize;
01942 psl->tStart = tStart;
01943 psl->tEnd = tEnd;
01944 strncpy(psl->strand, strand, 2);
01945 AllocArray(psl->blockSizes, blockSpace);
01946 AllocArray(psl->qStarts, blockSpace);
01947 AllocArray(psl->tStarts, blockSpace);
01948 if (opts & PSL_XA_FORMAT)
01949 {
01950 AllocArray(psl->qSequence, blockSpace);
01951 AllocArray(psl->tSequence, blockSpace);
01952 }
01953 return psl;
01954 }
01955
01956 void pslGrow(struct psl *psl, int *blockSpacePtr)
01957
01958
01959
01960 {
01961 int blockSpace = *blockSpacePtr;
01962 int newSpace = 2 * blockSpace;
01963 ExpandArray(psl->blockSizes, blockSpace, newSpace);
01964 ExpandArray(psl->qStarts, blockSpace, newSpace);
01965 ExpandArray(psl->tStarts, blockSpace, newSpace);
01966 if (psl->qSequence != NULL)
01967 {
01968 ExpandArray(psl->qSequence, blockSpace, newSpace);
01969 ExpandArray(psl->tSequence, blockSpace, newSpace);
01970 }
01971 *blockSpacePtr = newSpace;
01972 }
01973
01974 int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree)
01975
01976 {
01977 int i;
01978 int overlap = 0;
01979 boolean isRc = (psl->strand[1] == '-');
01980 for (i=0; i<psl->blockCount; ++i)
01981 {
01982 int start = psl->tStarts[i];
01983 int end = start + psl->blockSizes[i];
01984 if (isRc)
01985 reverseIntRange(&start, &end, psl->tSize);
01986 overlap += rangeTreeOverlapSize(rangeTree, start, end);
01987 }
01988 return overlap;
01989 }
01990