00001
00002
00003
00004
00005
00006
00007
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "dnaseq.h"
00011 #include "fa.h"
00012 #include "gdf.h"
00013 #include "nt4.h"
00014 #include "snof.h"
00015 #include "wormdna.h"
00016 #include "cda.h"
00017 #include "sig.h"
00018 #include "dystring.h"
00019
00020 static char const rcsid[] = "$Id: wormdna.c,v 1.10 2005/04/10 14:41:26 markd Exp $";
00021
00022 static char *jkwebDir = NULL;
00023
00024 static char *cdnaDir = NULL;
00025 static char *featDir = NULL;
00026 static char *nt4Dir = NULL;
00027 static char *sangerDir = NULL;
00028 static char *genieDir = NULL;
00029 static char *xenoDir = NULL;
00030
00031 static void getDirs()
00032
00033 {
00034 if (jkwebDir == NULL)
00035 {
00036 char buf[512];
00037
00038
00039
00040 if ((jkwebDir = getenv("JKWEB")) == NULL)
00041 jkwebDir = "";
00042
00043 sprintf(buf, "%scdna.dir", jkwebDir);
00044 firstWordInFile(buf, buf, sizeof(buf));
00045 cdnaDir = cloneString(buf);
00046
00047 sprintf(buf, "%sfeat.dir", jkwebDir);
00048 firstWordInFile(buf, buf, sizeof(buf));
00049 featDir = cloneString(buf);
00050
00051 sprintf(buf, "%snt4.dir", jkwebDir);
00052 firstWordInFile(buf, buf, sizeof(buf));
00053 nt4Dir = cloneString(buf);
00054
00055 sprintf(buf, "%ssanger/", featDir);
00056 sangerDir = cloneString(buf);
00057
00058 sprintf(buf, "%sgenie/", featDir);
00059 genieDir = cloneString(buf);
00060
00061 sprintf(buf, "%sxeno.dir", jkwebDir);
00062 firstWordInFile(buf, buf, sizeof(buf));
00063 xenoDir = cloneString(buf);
00064 }
00065 }
00066
00067 char *wormFeaturesDir()
00068
00069 {
00070 getDirs();
00071 return featDir;
00072 }
00073
00074 char *wormChromDir()
00075
00076 {
00077 getDirs();
00078 return nt4Dir;
00079 }
00080
00081 char *wormCdnaDir()
00082
00083 {
00084 getDirs();
00085 return cdnaDir;
00086 }
00087
00088 char *wormSangerDir()
00089
00090 {
00091 getDirs();
00092 return sangerDir;
00093 }
00094
00095 char *wormGenieDir()
00096
00097 {
00098 getDirs();
00099 return genieDir;
00100 }
00101
00102 char *wormXenoDir()
00103
00104 {
00105 getDirs();
00106 return xenoDir;
00107 }
00108
00109 static char *chromIds[] = {"i", "ii", "iii", "iv", "v", "x", "m", };
00110
00111 void wormChromNames(char ***retNames, int *retNameCount)
00112
00113 {
00114 *retNames = chromIds;
00115 *retNameCount = ArraySize(chromIds);
00116 }
00117
00118 int wormChromIx(char *name)
00119
00120 {
00121 return stringIx(name, chromIds);
00122 }
00123
00124 char *wormChromForIx(int ix)
00125
00126 {
00127 assert(ix >= 0 && ix <= ArraySize(chromIds));
00128 return chromIds[ix];
00129 }
00130
00131 char *wormOfficialChromName(char *name)
00132
00133
00134
00135 {
00136 int ix = wormChromIx(name);
00137 if (ix < 0) return NULL;
00138 return chromIds[ix];
00139 }
00140
00141
00142 static struct snof *cdnaSnof = NULL;
00143 static FILE *cdnaFa = NULL;
00144
00145 static void wormCdnaCache()
00146
00147 {
00148 getDirs();
00149 if (cdnaSnof == NULL)
00150 {
00151 char buf[512];
00152
00153 sprintf(buf, "%s%s", cdnaDir, "allcdna");
00154 cdnaSnof = snofMustOpen(buf);
00155 sprintf(buf, "%s%s", cdnaDir, "allcdna.fa");
00156 cdnaFa = mustOpen(buf, "rb");
00157 }
00158 }
00159
00160 void wormCdnaUncache()
00161
00162 {
00163 snofClose(&cdnaSnof);
00164 carefulClose(&cdnaFa);
00165 freez(&cdnaDir);
00166 }
00167
00168 void wormFreeCdnaInfo(struct wormCdnaInfo *ci)
00169
00170
00171 {
00172 freeMem(ci->motherString);
00173 zeroBytes(ci, sizeof(*ci));
00174 }
00175
00176 static char *realInfoString(char *s)
00177
00178 {
00179 if (s[0] == '?' && s[1] == 0) return NULL;
00180 return s;
00181 }
00182
00183 static void parseRestOfCdnaInfo(char *textInfo, struct wormCdnaInfo *retInfo)
00184
00185 {
00186 int wordCount;
00187 char *words[32];
00188 char *s;
00189
00190 wordCount = chopString(textInfo, "|", words, ArraySize(words));
00191 if (wordCount < 8)
00192 errAbort("Expecting at least 8 fields in cDNA database, got %d", wordCount);
00193 if ((s = realInfoString(words[0])) != NULL)
00194 retInfo->orientation = s[0];
00195 retInfo->gene = realInfoString(words[1]);
00196 retInfo->product = realInfoString(words[2]);
00197 if ((s = realInfoString(words[3])) != NULL)
00198 {
00199 char *parts[2];
00200 int partCount;
00201 partCount = chopString(s, ".", parts, ArraySize(parts));
00202 if (partCount == 2)
00203 {
00204 retInfo->knowStart = retInfo->knowEnd = TRUE;
00205 if (parts[0][0] == '<')
00206 {
00207 retInfo->knowStart = FALSE;
00208 parts[0] += 1;
00209 }
00210 if (parts[1][0] == '>')
00211 {
00212 retInfo->knowEnd = FALSE;
00213 parts[1] += 1;
00214 }
00215 retInfo->cdsStart = atoi(parts[0]);
00216 retInfo->cdsEnd = atoi(parts[1]);
00217 }
00218 }
00219 if ((s = realInfoString(words[4])) != NULL)
00220 {
00221 if (sameString("embryo", s))
00222 retInfo->isEmbryonic = TRUE;
00223 else if (sameString("adult", s))
00224 retInfo->isAdult = TRUE;
00225 }
00226 if ((s = realInfoString(words[5])) != NULL)
00227 {
00228 if (sameString("herm", s))
00229 retInfo->isHermaphrodite = TRUE;
00230 else if (sameString("male", s))
00231 retInfo->isMale = TRUE;
00232 }
00233
00234 if ((s = realInfoString(words[6])) != NULL)
00235 {
00236
00237 }
00238 retInfo->description = realInfoString(words[7]);
00239 }
00240
00241 void wormFaCommentIntoInfo(char *faComment, struct wormCdnaInfo *retInfo)
00242
00243
00244 {
00245 if (retInfo)
00246 {
00247 char *s;
00248 zeroBytes(retInfo, sizeof(*retInfo));
00249
00250 s = strchr(faComment, ' ');
00251 if (s == NULL)
00252 errAbort("Expecting lots of info, just got %s", faComment);
00253 *s++ = 0;
00254 retInfo->name = faComment+1;
00255 retInfo->motherString = faComment;
00256
00257 parseRestOfCdnaInfo(s, retInfo);
00258 }
00259 }
00260
00261 boolean wormCdnaInfo(char *name, struct wormCdnaInfo *retInfo)
00262
00263 {
00264 char commentBuf[512];
00265 char *comment;
00266 long offset;
00267
00268 wormCdnaCache();
00269 if (!snofFindOffset(cdnaSnof, name, &offset))
00270 return FALSE;
00271 fseek(cdnaFa, offset, SEEK_SET);
00272 fgets(commentBuf, sizeof(commentBuf), cdnaFa);
00273 if (commentBuf[0] != '>')
00274 errAbort("Expecting line starting with > in cDNA fa file.\nGot %s", commentBuf);
00275 comment = cloneString(commentBuf);
00276 wormFaCommentIntoInfo(comment, retInfo);
00277 return TRUE;
00278 }
00279
00280 boolean wormCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo)
00281
00282
00283 {
00284 long offset;
00285 char *faComment;
00286 char **pFaComment = (retInfo == NULL ? NULL : &faComment);
00287
00288 wormCdnaCache();
00289 if (!snofFindOffset(cdnaSnof, name, &offset))
00290 return FALSE;
00291 fseek(cdnaFa, offset, SEEK_SET);
00292 if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna))
00293 return FALSE;
00294 wormFaCommentIntoInfo(faComment, retInfo);
00295 return TRUE;
00296 }
00297
00298 struct wormFeature *newWormFeature(char *name, char *chrom, int start, int end, char typeByte)
00299
00300 {
00301 int size = sizeof(struct wormFeature) + strlen(name);
00302 struct wormFeature *feat = needMem(size);
00303 feat->chrom = chrom;
00304 feat->start = start;
00305 feat->end = end;
00306 feat->typeByte = typeByte;
00307 strcpy(feat->name, name);
00308 return feat;
00309 }
00310
00311
00312 static struct wormFeature *scanChromOffsetFile(char *dir, char *suffix,
00313 bits32 signature, int nameOffset, char *chromId, int start, int end,
00314 int addEnd)
00315
00316
00317 {
00318 FILE *f;
00319 char fileName[512];
00320 bits32 sig, nameSize, entryCount;
00321 int entrySize;
00322 int *entry;
00323 char *name;
00324 bits32 i;
00325 struct wormFeature *list = NULL, *el;
00326 char *typePt;
00327 char typeByte;
00328
00329 sprintf(fileName, "%s%s%s", dir, chromId, suffix);
00330 f = mustOpen(fileName, "rb");
00331 mustReadOne(f, sig);
00332 if (sig != signature)
00333 errAbort("Bad signature on %s", fileName);
00334 mustReadOne(f, entryCount);
00335 mustReadOne(f, nameSize);
00336 entrySize = nameSize + nameOffset;
00337 entry = needMem(entrySize + 1);
00338 name = (char *)entry;
00339 name += nameOffset;
00340 typePt = name-1;
00341 for (i=0; i<entryCount; ++i)
00342 {
00343 mustRead(f, entry, entrySize);
00344 if (entry[0] > end)
00345 break;
00346 if (entry[1] < start)
00347 continue;
00348 typeByte = *typePt;
00349 el = newWormFeature(name, chromId, entry[0], entry[1]+addEnd, typeByte);
00350 slAddHead(&list, el);
00351 }
00352 slReverse(&list);
00353 fclose(f);
00354 freeMem(entry);
00355 return list;
00356 }
00357
00358 struct wormFeature *wormCdnasInRange(char *chromId, int start, int end)
00359
00360
00361 {
00362
00363
00364 getDirs();
00365 return scanChromOffsetFile(cdnaDir, ".cdo", cdoSig, 2*sizeof(int)+1,
00366 chromId, start, end, 0);
00367 }
00368
00369 struct wormFeature *wormSomeGenesInRange(char *chromId, int start, int end, char *gdfDir)
00370
00371 {
00372 return scanChromOffsetFile(gdfDir, ".pgo", pgoSig, 2*sizeof(int)+1,
00373 chromId, start, end, 0);
00374 }
00375
00376 struct wormFeature *wormGenesInRange(char *chromId, int start, int end)
00377
00378 {
00379
00380
00381 getDirs();
00382 return wormSomeGenesInRange(chromId, start, end, sangerDir);
00383 }
00384
00385 struct wormFeature *wormCosmidsInRange(char *chromId, int start, int end)
00386
00387 {
00388
00389
00390 getDirs();
00391 return scanChromOffsetFile(featDir, ".coo", pgoSig, 2*sizeof(int)+1,
00392 chromId, start, end, 1);
00393 }
00394
00395 FILE *wormOpenGoodAli()
00396
00397
00398 {
00399 char fileName[512];
00400 getDirs();
00401 sprintf(fileName, "%sgood.ali", cdnaDir);
00402 return cdaOpenVerify(fileName);
00403 }
00404
00405 struct cdaAli *wormCdaAlisInRange(char *chromId, int start, int end)
00406
00407 {
00408 struct cdaAli *list = NULL, *el;
00409 char fileName[512];
00410 FILE *ixFile, *aliFile;
00411 bits32 sig;
00412 int s, e;
00413 long fpos;
00414
00415 aliFile = wormOpenGoodAli();
00416
00417 sprintf(fileName, "%s%s.alx", cdnaDir, chromId);
00418 ixFile = mustOpen(fileName, "rb");
00419 mustReadOne(ixFile, sig);
00420 if (sig != alxSig)
00421 errAbort("Bad signature on %s", fileName);
00422
00423 for (;;)
00424 {
00425 if (!readOne(ixFile, s))
00426 break;
00427 mustReadOne(ixFile, e);
00428 mustReadOne(ixFile, fpos);
00429 if (e <= start)
00430 continue;
00431 if (s >= end)
00432 break;
00433 AllocVar(el);
00434 fseek(aliFile, fpos, SEEK_SET);
00435 el = cdaLoadOne(aliFile);
00436 if (el == NULL)
00437 errAbort("Truncated cdnaAli file");
00438 slAddHead(&list, el);
00439 }
00440 slReverse(&list);
00441 fclose(aliFile);
00442 fclose(ixFile);
00443 return list;
00444 }
00445
00446 boolean nextWormCdnaAndInfo(struct wormCdnaIterator *it, struct dnaSeq **retSeq,
00447 struct wormCdnaInfo *retInfo)
00448
00449 {
00450 char *faComment;
00451
00452 if (!faReadNext(it->faFile, "unknown", TRUE, &faComment, retSeq))
00453 return FALSE;
00454 wormFaCommentIntoInfo(faComment, retInfo);
00455 return TRUE;
00456 }
00457
00458 struct dnaSeq *nextWormCdna(struct wormCdnaIterator *it)
00459
00460 {
00461 return faReadOneDnaSeq(it->faFile, "unknown", TRUE);
00462 }
00463
00464 boolean wormSearchAllCdna(struct wormCdnaIterator **retSi)
00465
00466 {
00467 char buf[512];
00468 struct wormCdnaIterator *it;
00469
00470 it = needMem(sizeof(*it));
00471 getDirs();
00472 sprintf(buf, "%s%s", cdnaDir, "allcdna.fa");
00473 it->faFile = mustOpen(buf, "rb");
00474 *retSi = it;
00475 return TRUE;
00476 }
00477
00478 void freeWormCdnaIterator(struct wormCdnaIterator **pIt)
00479
00480 {
00481 struct wormCdnaIterator *it = *pIt;
00482 if (it != NULL)
00483 {
00484 carefulClose(&it->faFile);
00485 freez(pIt);
00486 }
00487 }
00488
00489 static boolean isAllAlpha(char *s)
00490
00491 {
00492 char c;
00493 while ((c = *s++) != 0)
00494 {
00495 if (!isalpha(c)) return FALSE;
00496 }
00497 return TRUE;
00498 }
00499
00500 static boolean isAllDigit(char *s)
00501
00502 {
00503 char c;
00504 while ((c = *s++) != 0)
00505 {
00506 if (!isdigit(c)) return FALSE;
00507 }
00508 return TRUE;
00509 }
00510
00511 boolean wormIsOrfName(char *in)
00512
00513
00514 {
00515 return strchr(in, '.') != NULL;
00516 }
00517
00518 boolean wormIsGeneName(char *name)
00519
00520
00521
00522 {
00523 char buf[128];
00524 int partCount;
00525 strncpy(buf, name, sizeof(buf));
00526 partCount = chopString(buf, "-", NULL, 0);
00527 if (partCount == 2)
00528 {
00529 char *parts[2];
00530 chopString(buf, "-", parts, 2);
00531 return isAllAlpha(parts[0]) && isAllDigit(parts[1]);
00532 }
00533 else
00534 {
00535 return FALSE;
00536 }
00537 }
00538
00539 struct slName *wormGeneToOrfNames(char *name)
00540
00541 {
00542 struct slName *synList = NULL;
00543 char synFileName[512];
00544 FILE *synFile;
00545 char lineBuf[128];
00546 int nameLen = strlen(name);
00547
00548
00549 tolowers(name);
00550
00551
00552 sprintf(synFileName, "%ssyn", wormFeaturesDir());
00553 if ((synFile = fopen(synFileName, "r")) == NULL)
00554 errAbort("Can't find synonym file '%s'. (errno: %d)\n", synFileName, errno);
00555 while (fgets(lineBuf, ArraySize(lineBuf), synFile))
00556 {
00557
00558 if (strncmp(name, lineBuf, nameLen) == 0)
00559 {
00560 char *syns[32];
00561 int count;
00562 count = chopString(lineBuf, whiteSpaceChopper, syns, ArraySize(syns));
00563
00564
00565 if (strcmp(name, syns[0]) == 0)
00566 {
00567 int i;
00568 for (i=1; i<count; ++i)
00569 slAddTail(&synList, newSlName(syns[i]));
00570 break;
00571 }
00572 }
00573 }
00574 fclose(synFile);
00575 return synList;
00576 }
00577
00578 char *wormGeneFirstOrfName(char *geneName)
00579
00580 {
00581 struct slName *synList = wormGeneToOrfNames(geneName);
00582 char *name;
00583 if (synList == NULL)
00584 return NULL;
00585 name = cloneString(synList->name);
00586 slFreeList(&synList);
00587 return name;
00588 }
00589
00590 boolean wormGeneForOrf(char *orfName, char *geneNameBuf, int bufSize)
00591
00592 {
00593 FILE *f;
00594 char fileName[512];
00595 char lineBuf[512];
00596 int nameLen = strlen(orfName);
00597 boolean ok = FALSE;
00598
00599 sprintf(fileName, "%sorf2gene", wormFeaturesDir());
00600 f = mustOpen(fileName, "r");
00601 while (fgets(lineBuf, sizeof(lineBuf), f))
00602 {
00603 if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
00604 {
00605 char *words[2];
00606 int wordCount;
00607 wordCount = chopLine(lineBuf, words);
00608 assert((int)strlen(words[1]) < bufSize);
00609 strncpy(geneNameBuf, words[1], bufSize);
00610 ok = TRUE;
00611 break;
00612 }
00613 }
00614 fclose(f);
00615 return ok;
00616 }
00617
00618 boolean wormInfoForGene(char *orfName, struct wormCdnaInfo *retInfo)
00619
00620 {
00621 FILE *f;
00622 char fileName[512];
00623 char lineBuf[512];
00624 int nameLen;
00625 char *info = NULL;
00626 char *synName = NULL;
00627 int lineCount = 0;
00628
00629
00630 if (wormIsGeneName(orfName))
00631 {
00632 synName = wormGeneFirstOrfName(orfName);
00633 if (synName != NULL)
00634 orfName = synName;
00635 }
00636 sprintf(fileName, "%sorfInfo", wormFeaturesDir());
00637 nameLen = strlen(orfName);
00638 f = mustOpen(fileName, "r");
00639 while (fgets(lineBuf, sizeof(lineBuf), f))
00640 {
00641 ++lineCount;
00642 if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
00643 {
00644 info = cloneString(lineBuf);
00645 break;
00646 }
00647 }
00648 freeMem(synName);
00649 fclose(f);
00650 if (info == NULL)
00651 return FALSE;
00652 wormFaCommentIntoInfo(info, retInfo);
00653 return TRUE;;
00654 }
00655
00656 boolean getWormGeneDna(char *name, DNA **retDna, boolean upcExons)
00657
00658 {
00659 struct gdfGene *g;
00660 struct slName *syn = NULL;
00661 long lstart, lend;
00662 int start, end;
00663 int dnaSize;
00664 DNA *dna;
00665 struct wormGdfCache *gdfCache;
00666
00667
00668 if (wormIsGeneName(name))
00669 {
00670 syn = wormGeneToOrfNames(name);
00671 if (syn != NULL)
00672 name = syn->name;
00673 }
00674 if (strncmp(name, "g-", 2) == 0)
00675 gdfCache = &wormGenieGdfCache;
00676 else
00677 gdfCache = &wormSangerGdfCache;
00678 if ((g = wormGetSomeGdfGene(name, gdfCache)) == NULL)
00679 return FALSE;
00680 gdfGeneExtents(g, &lstart, &lend);
00681 start = lstart;
00682 end = lend;
00683
00684 dnaSize = end-start;
00685 *retDna = dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);
00686
00687 gdfOffsetGene(g, -start);
00688 if (g->strand == '-')
00689 {
00690 reverseComplement(dna, dnaSize);
00691 gdfRcGene(g, dnaSize);
00692 }
00693 if (upcExons)
00694 {
00695 int i;
00696 struct gdfDataPoint *pt = g->dataPoints;
00697 for (i=0; i<g->dataCount; i += 2)
00698 {
00699 toUpperN(dna + pt[i].start, pt[i+1].start - pt[i].start);
00700 }
00701 }
00702 gdfFreeGene(g);
00703 return TRUE;
00704 }
00705
00706 boolean getWormGeneExonDna(char *name, DNA **retDna)
00707
00708 {
00709 struct gdfGene *g;
00710 struct slName *syn = NULL;
00711 long lstart, lend;
00712 int start, end;
00713 int dnaSize;
00714 DNA *dna;
00715 int i;
00716 struct gdfDataPoint *pt = NULL;
00717 struct wormGdfCache *gdfCache;
00718 struct dyString *dy = newDyString(1000);
00719
00720 if (wormIsGeneName(name))
00721 {
00722 syn = wormGeneToOrfNames(name);
00723 if (syn != NULL)
00724 name = syn->name;
00725 }
00726 if (strncmp(name, "g-", 2) == 0)
00727 gdfCache = &wormGenieGdfCache;
00728 else
00729 gdfCache = &wormSangerGdfCache;
00730 if ((g = wormGetSomeGdfGene(name, gdfCache)) == NULL)
00731 return FALSE;
00732 gdfGeneExtents(g, &lstart, &lend);
00733 start = lstart;
00734 end = lend;
00735
00736 dnaSize = end-start;
00737 dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);
00738
00739 gdfOffsetGene(g, -start);
00740 if (g->strand == '-')
00741 {
00742 reverseComplement(dna, dnaSize);
00743 gdfRcGene(g, dnaSize);
00744 }
00745 pt = g->dataPoints;
00746 for (i=0; i<g->dataCount; i += 2)
00747 {
00748 dyStringAppendN(dy, (dna+pt[i].start), (pt[i+1].start - pt[i].start));
00749 }
00750 *retDna = cloneString(dy->string);
00751 dyStringFree(&dy);
00752 gdfFreeGene(g);
00753 return TRUE;
00754 }
00755
00756 static void makeChromFileName(char *chromId, char *buf)
00757 {
00758 getDirs();
00759 sprintf(buf, "%s%s.nt4", nt4Dir, chromId);
00760 }
00761
00762 void wormLoadNt4Genome(struct nt4Seq ***retNt4Seq, int *retNt4Count)
00763
00764 {
00765 int count = ArraySize(chromIds);
00766 struct nt4Seq **nt4s = needMem(count*sizeof(*nt4s));
00767 int i;
00768 char fileName[512];
00769
00770 for (i=0; i<count; ++i)
00771 {
00772 makeChromFileName(chromIds[i], fileName);
00773 nt4s[i] = loadNt4(fileName, chromIds[i]);
00774 }
00775 *retNt4Seq = nt4s;
00776 *retNt4Count = count;
00777 }
00778
00779 void wormFreeNt4Genome(struct nt4Seq ***pNt4Seq)
00780
00781 {
00782 struct nt4Seq **seqs;
00783 int i;
00784 if ((seqs = *pNt4Seq) == NULL)
00785 return;
00786 for (i=0; i<ArraySize(chromIds); ++i)
00787 freeNt4(&seqs[i]);
00788 freez(pNt4Seq);
00789 }
00790
00791 int wormChromSize(char *chrom)
00792
00793 {
00794 static int sizes[ArraySize(chromIds)];
00795 int ix;
00796 int size;
00797
00798 if ((ix = wormChromIx(chrom)) < 0)
00799 errAbort("%s isn't a chromosome", chrom);
00800 size = sizes[ix];
00801
00802
00803 if (size == 0)
00804 {
00805 char fileName[512];
00806 makeChromFileName(chromIds[ix], fileName);
00807 size = sizes[ix] = nt4BaseCount(fileName);
00808 }
00809 return size;
00810 }
00811
00812
00813 DNA *wormChromPart(char *chromId, int start, int size)
00814
00815 {
00816 char fileName[512];
00817 makeChromFileName(chromId, fileName);
00818 return nt4LoadPart(fileName, start, size);
00819 }
00820
00821 DNA *wormChromPartExonsUpper(char *chromId, int start, int size)
00822
00823 {
00824 DNA *dna = wormChromPart(chromId, start, size);
00825 struct wormFeature *geneFeat = wormGenesInRange(chromId, start, start+size);
00826 struct wormFeature *feat;
00827
00828 for (feat = geneFeat; feat != NULL; feat = feat->next)
00829 {
00830 char *name = feat->name;
00831 if (!wormIsNamelessCluster(name))
00832 {
00833 struct gdfGene *gene = wormGetGdfGene(name);
00834 gdfUpcExons(gene, feat->start, dna, size, start);
00835 gdfFreeGene(gene);
00836 }
00837 }
00838 slFreeList(&geneFeat);
00839 return dna;
00840 }
00841
00842 void wormClipRangeToChrom(char *chrom, int *pStart, int *pEnd)
00843
00844 {
00845 int chromEnd = wormChromSize(chrom);
00846 int temp;
00847
00848
00849 if (*pStart > *pEnd)
00850 {
00851 temp = *pEnd;
00852 *pEnd = *pStart;
00853 *pStart = temp;
00854 }
00855
00856
00857
00858 if (*pStart < 0)
00859 {
00860 *pEnd -= *pStart;
00861 *pStart = 0;
00862 }
00863 if (*pEnd > chromEnd)
00864 {
00865 *pStart -= *pEnd - chromEnd;
00866 *pEnd = chromEnd;
00867 }
00868
00869 if (*pStart < 0)
00870 *pStart = 0;
00871 }
00872
00873 boolean wormParseChromRange(char *in, char **retChromId, int *retStart, int *retEnd)
00874
00875
00876 {
00877 char *words[5];
00878 int wordCount;
00879 char *chromId;
00880 char buf[128];
00881
00882 strncpy(buf, in, sizeof(buf));
00883 wordCount = chopString(buf, "- \t\r\n:", words, ArraySize(words));
00884 if (wordCount != 3)
00885 return FALSE;
00886 chromId = wormOfficialChromName(words[0]);
00887 if (chromId == NULL)
00888 return FALSE;
00889 if (!isdigit(words[1][0]) || !isdigit(words[2][0]))
00890 return FALSE;
00891 *retChromId = chromId;
00892 *retStart = atoi(words[1]);
00893 *retEnd = atoi(words[2]);
00894 wormClipRangeToChrom(chromId, retStart, retEnd);
00895 return TRUE;
00896 }
00897
00898 boolean wormIsChromRange(char *in)
00899
00900
00901 {
00902 char *chromId;
00903 int start, end;
00904 boolean ok;
00905 ok = wormParseChromRange(in, &chromId, &start, &end);
00906 return ok;
00907 }
00908
00909 boolean wormFixupOrfName(char *name)
00910
00911 {
00912 char *dot = strrchr(name, '.');
00913 if (dot == NULL)
00914 return FALSE;
00915 toUpperN(name, dot-name);
00916 if (!isdigit(dot[1]))
00917 return TRUE;
00918 else
00919 tolowers(dot+1);
00920 return TRUE;
00921 }
00922
00923 boolean wormIsAltSplicedName(char *name)
00924
00925 {
00926 char *dot = strrchr(name, '.');
00927 if (dot == NULL)
00928 return FALSE;
00929 if (!isdigit(dot[1]))
00930 return FALSE;
00931 return isalpha(dot[strlen(dot)-1]);
00932 }
00933
00934 static void makeIsoformBaseName(char *name)
00935 {
00936 if (wormIsAltSplicedName(name))
00937 name[strlen(name)-1] = 0;
00938 }
00939
00940 static boolean findAltSpliceRange(char *name, struct snof *snof, FILE *f,
00941 char **retChrom, int *retStart, int *retEnd, char *retStrand)
00942
00943 {
00944 char baseName[64];
00945 char bName[64];
00946 int snIx, maxIx;
00947 int start = 0x7fffffff;
00948 int end = -start;
00949 char lineBuf[128];
00950 char *words[3];
00951 int wordCount;
00952 int baseNameSize;
00953
00954 strcpy(baseName, name);
00955 makeIsoformBaseName(baseName);
00956 baseNameSize = strlen(baseName);
00957 if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
00958 return FALSE;
00959 maxIx = snofElementCount(snof);
00960 for (;snIx < maxIx; ++snIx)
00961 {
00962 long offset;
00963 char *geneName;
00964
00965 snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
00966 if (strncmp(geneName, baseName, baseNameSize) != 0)
00967 break;
00968 strcpy(bName, geneName);
00969 makeIsoformBaseName(bName);
00970 if (sameString(baseName, bName))
00971 {
00972 int s, e;
00973 fseek(f, offset, SEEK_SET);
00974 fgets(lineBuf, sizeof(lineBuf), f);
00975 wordCount = chopLine(lineBuf, words);
00976 assert(wordCount == 3);
00977 wormParseChromRange(words[0], retChrom, &s, &e);
00978 *retStrand = words[1][0];
00979 if (start > s)
00980 start = s;
00981 if (end < e)
00982 end = e;
00983 }
00984 }
00985 *retStart = start;
00986 *retEnd = end;
00987 return TRUE;
00988 }
00989
00990
00991 boolean wormGeneRange(char *name, char **retChrom, char *retStrand, int *retStart, int *retEnd)
00992
00993 {
00994 static struct snof *c2gSnof = NULL, *c2cSnof = NULL;
00995 static FILE *c2gFile = NULL, *c2cFile = NULL;
00996 long offset;
00997 char fileName[512];
00998 struct slName *syn = NULL;
00999 boolean ok;
01000
01001 if (wormIsChromRange(name))
01002 {
01003 *retStrand = '.';
01004 return wormParseChromRange(name, retChrom, retStart, retEnd);
01005 }
01006
01007 getDirs();
01008
01009
01010 if (wormIsGeneName(name))
01011 {
01012 syn = wormGeneToOrfNames(name);
01013 if (syn != NULL)
01014 {
01015 name = syn->name;
01016 }
01017 }
01018 if (wormFixupOrfName(name))
01019 {
01020 if (c2gSnof == NULL)
01021 {
01022 sprintf(fileName, "%sc2g", featDir);
01023 c2gSnof = snofMustOpen(fileName);
01024 sprintf(fileName, "%sc2g", featDir);
01025 c2gFile = mustOpen(fileName, "rb");
01026 }
01027 ok = findAltSpliceRange(name, c2gSnof, c2gFile, retChrom, retStart, retEnd, retStrand);
01028 }
01029 else
01030 {
01031 char lineBuf[128];
01032 char *words[3];
01033 int wordCount;
01034 touppers(name);
01035 if (c2cSnof == NULL)
01036 {
01037 sprintf(fileName, "%sc2c", featDir);
01038 c2cSnof = snofMustOpen(fileName);
01039 sprintf(fileName, "%sc2c", featDir);
01040 c2cFile = mustOpen(fileName, "rb");
01041 }
01042 if (!snofFindOffset(c2cSnof, name, &offset) )
01043 return FALSE;
01044 fseek(c2cFile, offset, SEEK_SET);
01045 fgets(lineBuf, sizeof(lineBuf), c2cFile);
01046 wordCount = chopLine(lineBuf, words);
01047 assert(wordCount == 3);
01048 assert(strcmp(words[2], name) == 0);
01049 assert(wormIsChromRange(words[0]));
01050 *retStrand = words[1][0];
01051 ok = wormParseChromRange(words[0], retChrom, retStart, retEnd);
01052 }
01053 slFreeList(&syn);
01054 return ok;
01055 }
01056
01057 boolean wormIsNamelessCluster(char *name)
01058
01059 {
01060 char *e = strrchr(name, '.');
01061 if (e == NULL)
01062 return FALSE;
01063 if (e[1] != 'N')
01064 return FALSE;
01065 if (!isdigit(e[2]))
01066 return FALSE;
01067 return TRUE;
01068 }
01069
01070 DNA *wormGetNamelessClusterDna(char *name)
01071
01072 {
01073 char *chrom;
01074 int start, end;
01075 char strand;
01076 if (!wormGeneRange(name, &chrom, &strand, &start, &end))
01077 errAbort("Can't find %s in database", name);
01078 return wormChromPart(chrom, start, end-start);
01079 }
01080
01081 struct wormGdfCache wormSangerGdfCache = {&sangerDir,NULL,NULL};
01082 struct wormGdfCache wormGenieGdfCache = {&genieDir,NULL,NULL};
01083 struct wormGdfCache *defaultGdfCache = &wormSangerGdfCache;
01084
01085
01086 static void wormCacheSomeGdf(struct wormGdfCache *cache)
01087
01088 {
01089 if (cache->snof == NULL)
01090 {
01091 char fileName[512];
01092 char *dir;
01093 bits32 sig;
01094 getDirs();
01095 dir = *(cache->pDir);
01096 sprintf(fileName, "%sgenes", dir);
01097 cache->snof = snofMustOpen(fileName);
01098 sprintf(fileName, "%sgenes.gdf", dir);
01099 cache->file = mustOpen(fileName, "rb");
01100 mustReadOne(cache->file, sig);
01101 if (sig != glSig)
01102 errAbort("%s is not a good file", fileName);
01103 }
01104 }
01105
01106 #if 0
01107 static void wormCacheGdf()
01108
01109 {
01110 wormCacheSomeGdf(defaultGdfCache);
01111 }
01112 #endif
01113
01114 void wormUncacheSomeGdf(struct wormGdfCache *cache)
01115
01116 {
01117 snofClose(&cache->snof);
01118 carefulClose(&cache->file);
01119 }
01120
01121 void wormUncacheGdf()
01122
01123 {
01124 wormUncacheSomeGdf(defaultGdfCache);
01125 }
01126
01127 struct gdfGene *wormGetSomeGdfGene(char *name, struct wormGdfCache *cache)
01128
01129 {
01130 long offset;
01131
01132 wormCacheSomeGdf(cache);
01133 if (!snofFindOffset(cache->snof, name, &offset) )
01134 return NULL;
01135 fseek(cache->file, offset, SEEK_SET);
01136 return gdfReadOneGene(cache->file);
01137 }
01138
01139 struct gdfGene *wormGetGdfGene(char *name)
01140
01141 {
01142 return wormGetSomeGdfGene(name, defaultGdfCache);
01143 }
01144
01145 struct gdfGene *wormGetSomeGdfGeneList(char *baseName, int baseNameSize, struct wormGdfCache *cache)
01146
01147 {
01148 int snIx;
01149 int maxIx;
01150 struct snof *snof;
01151 FILE *f;
01152 struct gdfGene *list = NULL, *el;
01153
01154 wormCacheSomeGdf(cache);
01155 snof = cache->snof;
01156 f = cache->file;
01157 if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
01158 return NULL;
01159
01160 maxIx = snofElementCount(snof);
01161 for (;snIx < maxIx; ++snIx)
01162 {
01163 long offset;
01164 char *geneName;
01165
01166 snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
01167 if (strncmp(geneName, baseName, baseNameSize) != 0)
01168 break;
01169 fseek(f, offset, SEEK_SET);
01170 el = gdfReadOneGene(f);
01171 slAddTail(&list, el);
01172 }
01173 slReverse(&list);
01174 return list;
01175 }
01176
01177 struct gdfGene *wormGetGdfGeneList(char *baseName, int baseNameSize)
01178
01179 {
01180 return wormGetSomeGdfGeneList(baseName, baseNameSize, defaultGdfCache);
01181 }
01182
01183 struct gdfGene *wormGdfGenesInRange(char *chrom, int start, int end,
01184 struct wormGdfCache *geneFinder)
01185
01186 {
01187 char *dir = NULL;
01188 struct gdfGene *gdfList = NULL, *gdf;
01189 struct wormFeature *nameList, *name;
01190
01191 if (geneFinder == &wormSangerGdfCache)
01192 dir = wormSangerDir();
01193 else if (geneFinder == &wormGenieGdfCache)
01194 dir = wormGenieDir();
01195 else
01196 errAbort("Unknown geneFinder line %d of %s", __LINE__, __FILE__);
01197
01198 nameList = wormSomeGenesInRange(chrom, start, end, dir);
01199 for (name = nameList; name != NULL; name = name->next)
01200 {
01201 char *n = name->name;
01202 if (!wormIsNamelessCluster(n))
01203 {
01204 gdf = wormGetSomeGdfGene(n, geneFinder);
01205 slAddHead(&gdfList, gdf);
01206 }
01207 }
01208 slFreeList(&nameList);
01209 slReverse(&gdfList);
01210 return gdfList;
01211 }
01212
01213