lib/wormdna.c

Go to the documentation of this file.
00001 /* wormDna - Stuff for finding worm DNA and annotation features.
00002  * This is pretty much the heart of the cobbled-together 'database'
00003  * behind the intronerator. 
00004  *
00005  * This file is copyright 2002 Jim Kent, but license is hereby
00006  * granted for all use - public, private or commercial. */
00007 
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "dnaseq.h"
00011 #include "fa.h"
00012 #include "gdf.h"
00013 #include "nt4.h"
00014 #include "snof.h"
00015 #include "wormdna.h"
00016 #include "cda.h"
00017 #include "sig.h"
00018 #include "dystring.h"
00019 
00020 static char const rcsid[] = "$Id: wormdna.c,v 1.10 2005/04/10 14:41:26 markd Exp $";
00021 
00022 static char *jkwebDir = NULL;
00023 
00024 static char *cdnaDir = NULL;
00025 static char *featDir = NULL;
00026 static char *nt4Dir = NULL;
00027 static char *sangerDir = NULL;
00028 static char *genieDir = NULL;
00029 static char *xenoDir = NULL;
00030 
00031 static void getDirs()
00032 /* Look up the directories where our data is stored. */
00033 {
00034 if (jkwebDir == NULL)
00035     {
00036     char buf[512];
00037     
00038     /* Look up directory where directory pointer files are stored
00039      * in environment string if it's there. */
00040     if ((jkwebDir = getenv("JKWEB")) == NULL)
00041         jkwebDir = "";
00042 
00043     sprintf(buf, "%scdna.dir", jkwebDir);
00044     firstWordInFile(buf, buf, sizeof(buf));
00045     cdnaDir = cloneString(buf);
00046 
00047     sprintf(buf, "%sfeat.dir", jkwebDir);
00048     firstWordInFile(buf, buf, sizeof(buf));
00049     featDir = cloneString(buf);
00050 
00051     sprintf(buf, "%snt4.dir", jkwebDir);
00052     firstWordInFile(buf, buf, sizeof(buf));
00053     nt4Dir = cloneString(buf);
00054 
00055     sprintf(buf, "%ssanger/", featDir); 
00056     sangerDir = cloneString(buf);
00057 
00058     sprintf(buf, "%sgenie/", featDir);
00059     genieDir = cloneString(buf);
00060 
00061     sprintf(buf, "%sxeno.dir", jkwebDir);
00062     firstWordInFile(buf, buf, sizeof(buf));
00063     xenoDir = cloneString(buf);
00064     }
00065 }
00066 
00067 char *wormFeaturesDir()
00068 /* Return the features directory. (Includes trailing slash.) */
00069 {
00070 getDirs();
00071 return featDir;
00072 }
00073 
00074 char *wormChromDir()
00075 /* Return the directory with the chromosomes. */
00076 {
00077 getDirs();
00078 return nt4Dir;
00079 }
00080 
00081 char *wormCdnaDir()
00082 /* Return directory with cDNA data. */
00083 {
00084 getDirs();
00085 return cdnaDir;
00086 }
00087 
00088 char *wormSangerDir()
00089 /* Return directory with Sanger specific gene predictions. */
00090 {
00091 getDirs();
00092 return sangerDir;
00093 }
00094 
00095 char *wormGenieDir()
00096 /* Return directory with Genie specific gene predictions. */
00097 {
00098 getDirs();
00099 return genieDir;
00100 }
00101 
00102 char *wormXenoDir()
00103 /* Return directory with cross-species alignments. */
00104 {
00105 getDirs();
00106 return xenoDir;
00107 }
00108 
00109 static char *chromIds[] = {"i", "ii", "iii", "iv", "v", "x", "m", };
00110 
00111 void wormChromNames(char ***retNames, int *retNameCount)
00112 /* Get list of worm chromosome names. */
00113 {
00114 *retNames = chromIds;
00115 *retNameCount = ArraySize(chromIds);
00116 }
00117 
00118 int wormChromIx(char *name)
00119 /* Return index of worm chromosome. */
00120 {
00121 return stringIx(name, chromIds);
00122 }
00123 
00124 char *wormChromForIx(int ix)
00125 /* Given ix, return worm chromosome official name. */
00126 {
00127 assert(ix >= 0 && ix <= ArraySize(chromIds));
00128 return chromIds[ix];
00129 }
00130 
00131 char *wormOfficialChromName(char *name)
00132 /* This returns a pointer to our official string for the chromosome name.
00133  * (This allows some routines to do direct pointer comparisons rather
00134  * than string comparisons.) */
00135 {
00136 int ix = wormChromIx(name);
00137 if (ix < 0) return NULL;
00138 return chromIds[ix];
00139 }
00140 
00141 
00142 static struct snof *cdnaSnof = NULL;
00143 static FILE *cdnaFa = NULL;
00144 
00145 static void wormCdnaCache()
00146 /* Set up to read cDNAs */
00147 {
00148 getDirs();
00149 if (cdnaSnof == NULL)
00150     {
00151     char buf[512];
00152 
00153     sprintf(buf, "%s%s", cdnaDir, "allcdna");
00154     cdnaSnof = snofMustOpen(buf);
00155     sprintf(buf, "%s%s", cdnaDir, "allcdna.fa");
00156     cdnaFa = mustOpen(buf, "rb");
00157     }
00158 }
00159 
00160 void wormCdnaUncache()
00161 /* Tear down structure for reading cDNAs. */
00162 {
00163 snofClose(&cdnaSnof);
00164 carefulClose(&cdnaFa);
00165 freez(&cdnaDir);
00166 }
00167 
00168 void wormFreeCdnaInfo(struct wormCdnaInfo *ci)
00169 /* Free the mother string in the cdnaInfo.  (The info structure itself normally isn't
00170  * dynamically allocated. */
00171 {
00172 freeMem(ci->motherString);
00173 zeroBytes(ci, sizeof(*ci));
00174 }
00175 
00176 static char *realInfoString(char *s)
00177 /* Returns NULL if s is just "?", the NULL placeholder. */
00178 {
00179 if (s[0] == '?' && s[1] == 0) return NULL;
00180 return s;
00181 }
00182 
00183 static void parseRestOfCdnaInfo(char *textInfo, struct wormCdnaInfo *retInfo)
00184 /* Parse text info string into a binary structure retInfo. */
00185 {
00186 int wordCount;
00187 char *words[32];
00188 char *s;
00189 
00190 wordCount = chopString(textInfo, "|", words, ArraySize(words));
00191 if (wordCount < 8)
00192     errAbort("Expecting at least 8 fields in cDNA database, got %d", wordCount);
00193 if ((s = realInfoString(words[0])) != NULL)
00194     retInfo->orientation = s[0];
00195 retInfo->gene = realInfoString(words[1]);
00196 retInfo->product = realInfoString(words[2]);
00197 if ((s = realInfoString(words[3])) != NULL)
00198     {
00199     char *parts[2];
00200     int partCount;
00201     partCount = chopString(s, ".", parts, ArraySize(parts));
00202     if (partCount == 2)
00203         {
00204         retInfo->knowStart = retInfo->knowEnd = TRUE;
00205         if (parts[0][0] == '<')
00206             {
00207             retInfo->knowStart = FALSE;
00208             parts[0] += 1;
00209             }
00210         if (parts[1][0] == '>')
00211             {
00212             retInfo->knowEnd = FALSE;
00213             parts[1] += 1;
00214             }
00215         retInfo->cdsStart = atoi(parts[0]);
00216         retInfo->cdsEnd = atoi(parts[1]);
00217         }
00218     }
00219 if ((s = realInfoString(words[4])) != NULL)
00220     {
00221     if (sameString("embryo", s))
00222         retInfo->isEmbryonic = TRUE;
00223     else if (sameString("adult", s))
00224         retInfo->isAdult = TRUE;
00225     }
00226 if ((s = realInfoString(words[5])) != NULL)
00227     {
00228     if (sameString("herm", s))
00229         retInfo->isHermaphrodite = TRUE;
00230     else if (sameString("male", s))
00231         retInfo->isMale = TRUE;
00232     }
00233 
00234 if ((s = realInfoString(words[6])) != NULL)
00235     {
00236     /* Reserved. Unused currently */
00237     }
00238 retInfo->description = realInfoString(words[7]);
00239 }
00240 
00241 void wormFaCommentIntoInfo(char *faComment, struct wormCdnaInfo *retInfo)
00242 /* Process line from .fa file containing information about cDNA into binary
00243  * structure. */
00244 {
00245 if (retInfo)
00246     {
00247     char *s;
00248     zeroBytes(retInfo, sizeof(*retInfo));
00249     /* Separate out first word and use it as name. */
00250     s = strchr(faComment, ' ');
00251     if (s == NULL)
00252         errAbort("Expecting lots of info, just got %s", faComment);
00253     *s++ = 0;
00254     retInfo->name = faComment+1;
00255     retInfo->motherString = faComment;
00256 
00257     parseRestOfCdnaInfo(s, retInfo);
00258     }
00259 }
00260 
00261 boolean wormCdnaInfo(char *name, struct wormCdnaInfo *retInfo)
00262 /* Get info about cDNA sequence. */
00263 {
00264 char commentBuf[512];
00265 char *comment;
00266 long offset;
00267 
00268 wormCdnaCache();
00269 if (!snofFindOffset(cdnaSnof, name, &offset))
00270     return FALSE;
00271 fseek(cdnaFa, offset, SEEK_SET);
00272 fgets(commentBuf, sizeof(commentBuf), cdnaFa);
00273 if (commentBuf[0] != '>')
00274     errAbort("Expecting line starting with > in cDNA fa file.\nGot %s", commentBuf);
00275 comment = cloneString(commentBuf);
00276 wormFaCommentIntoInfo(comment, retInfo);
00277 return TRUE;
00278 }
00279 
00280 boolean wormCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo)
00281 /* Get a single worm cDNA sequence. Optionally (if retInfo is non-null) get additional
00282  * info about the sequence. */
00283 {
00284 long offset;
00285 char *faComment;
00286 char **pFaComment = (retInfo == NULL ? NULL : &faComment);
00287 
00288 wormCdnaCache();
00289 if (!snofFindOffset(cdnaSnof, name, &offset))
00290     return FALSE;
00291 fseek(cdnaFa, offset, SEEK_SET);
00292 if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna))
00293     return FALSE;
00294 wormFaCommentIntoInfo(faComment, retInfo);
00295 return TRUE;
00296 }
00297 
00298 struct wormFeature *newWormFeature(char *name, char *chrom, int start, int end, char typeByte)
00299 /* Allocate a new feature. */
00300 {
00301 int size = sizeof(struct wormFeature) + strlen(name);
00302 struct wormFeature *feat = needMem(size);
00303 feat->chrom = chrom;
00304 feat->start = start;
00305 feat->end = end;
00306 feat->typeByte = typeByte;
00307 strcpy(feat->name, name);
00308 return feat;
00309 }
00310 
00311 
00312 static struct wormFeature *scanChromOffsetFile(char *dir, char *suffix, 
00313     bits32 signature, int nameOffset, char *chromId, int start, int end,
00314     int addEnd)
00315 /* Scan a chrom.pgo or chrom.cdo file for names of things that are within
00316  * range. */
00317 {
00318 FILE *f;
00319 char fileName[512];
00320 bits32 sig, nameSize, entryCount;
00321 int entrySize;
00322 int *entry;
00323 char *name;
00324 bits32 i;
00325 struct wormFeature *list = NULL, *el;
00326 char *typePt;
00327 char typeByte;
00328 
00329 sprintf(fileName, "%s%s%s", dir, chromId, suffix);
00330 f = mustOpen(fileName, "rb");
00331 mustReadOne(f, sig);
00332 if (sig != signature)
00333     errAbort("Bad signature on %s", fileName);
00334 mustReadOne(f, entryCount);
00335 mustReadOne(f, nameSize);
00336 entrySize = nameSize + nameOffset;
00337 entry = needMem(entrySize + 1);
00338 name = (char *)entry;
00339 name += nameOffset;
00340 typePt = name-1;
00341 for (i=0; i<entryCount; ++i)
00342     {
00343     mustRead(f, entry, entrySize);
00344     if (entry[0] > end)
00345         break;
00346     if (entry[1] < start)
00347         continue;
00348     typeByte = *typePt;
00349     el = newWormFeature(name, chromId, entry[0], entry[1]+addEnd, typeByte);
00350     slAddHead(&list, el);
00351     }
00352 slReverse(&list);
00353 fclose(f);
00354 freeMem(entry);
00355 return list;
00356 }
00357 
00358 struct wormFeature *wormCdnasInRange(char *chromId, int start, int end)
00359 /* Get all cDNAs that overlap the range. freeDnaSeqList the returned
00360  * list when you are through with it. */
00361 {
00362 /* This routine looks through the .CDO files made by cdnaOff
00363  */
00364 getDirs();
00365 return scanChromOffsetFile(cdnaDir, ".cdo", cdoSig, 2*sizeof(int)+1, 
00366     chromId, start, end, 0);
00367 }
00368 
00369 struct wormFeature *wormSomeGenesInRange(char *chromId, int start, int end, char *gdfDir)
00370 /* Get info on genes that overlap range in a particular set of gene predictions. */
00371 {
00372 return scanChromOffsetFile(gdfDir, ".pgo", pgoSig, 2*sizeof(int)+1,
00373     chromId, start, end, 0);
00374 }
00375 
00376 struct wormFeature *wormGenesInRange(char *chromId, int start, int end)
00377 /* Get names of all genes that overlap the range. */
00378 {
00379 /* This routine looks through the .PGO files made by makePgo
00380  */
00381 getDirs();
00382 return wormSomeGenesInRange(chromId, start, end, sangerDir);
00383 }
00384 
00385 struct wormFeature *wormCosmidsInRange(char *chromId, int start, int end)
00386 /* Get names of all genes that overlap the range. */
00387 {
00388 /* This routine looks through the .COO files made by makePgo
00389  */
00390 getDirs();
00391 return scanChromOffsetFile(featDir, ".coo", pgoSig, 2*sizeof(int)+1,
00392     chromId, start, end, 1);
00393 }
00394 
00395 FILE *wormOpenGoodAli()
00396 /* Opens good alignment file and reads signature. 
00397  * (You can then cdaLoadOne() it.) */
00398 {
00399 char fileName[512];
00400 getDirs();
00401 sprintf(fileName, "%sgood.ali", cdnaDir);
00402 return cdaOpenVerify(fileName);
00403 }
00404 
00405 struct cdaAli *wormCdaAlisInRange(char *chromId, int start, int end)
00406 /* Return list of cdna alignments that overlap range. */
00407 {
00408 struct cdaAli *list = NULL, *el;
00409 char fileName[512];
00410 FILE *ixFile, *aliFile;
00411 bits32 sig;
00412 int s, e;
00413 long fpos;
00414 
00415 aliFile = wormOpenGoodAli();
00416 
00417 sprintf(fileName, "%s%s.alx", cdnaDir, chromId);
00418 ixFile = mustOpen(fileName, "rb");
00419 mustReadOne(ixFile, sig);
00420 if (sig != alxSig)
00421     errAbort("Bad signature on %s", fileName);
00422 
00423 for (;;)
00424     {
00425     if (!readOne(ixFile, s))
00426         break;
00427     mustReadOne(ixFile, e);
00428     mustReadOne(ixFile, fpos);
00429     if (e <= start)
00430         continue;
00431     if (s >= end)
00432         break;
00433     AllocVar(el);
00434     fseek(aliFile, fpos, SEEK_SET);
00435     el = cdaLoadOne(aliFile);
00436     if (el == NULL)
00437         errAbort("Truncated cdnaAli file");
00438     slAddHead(&list, el);
00439     }
00440 slReverse(&list);
00441 fclose(aliFile);
00442 fclose(ixFile);
00443 return list;
00444 }
00445 
00446 boolean nextWormCdnaAndInfo(struct wormCdnaIterator *it, struct dnaSeq **retSeq, 
00447     struct wormCdnaInfo *retInfo)
00448 /* Return next sequence and associated info from database. */
00449 {
00450 char *faComment;
00451 
00452 if (!faReadNext(it->faFile, "unknown", TRUE, &faComment, retSeq))
00453     return FALSE;
00454 wormFaCommentIntoInfo(faComment, retInfo);
00455 return TRUE;
00456 }
00457 
00458 struct dnaSeq *nextWormCdna(struct wormCdnaIterator *it)
00459 /* Return next sequence in database */
00460 {
00461 return faReadOneDnaSeq(it->faFile, "unknown", TRUE);
00462 }
00463 
00464 boolean wormSearchAllCdna(struct wormCdnaIterator **retSi)
00465 /* Set up to search entire database or worm cDNA */
00466 {
00467 char buf[512];
00468 struct wormCdnaIterator *it;
00469 
00470 it = needMem(sizeof(*it));
00471 getDirs();
00472 sprintf(buf, "%s%s", cdnaDir, "allcdna.fa");
00473 it->faFile = mustOpen(buf, "rb");
00474 *retSi = it;
00475 return TRUE;
00476 }
00477 
00478 void freeWormCdnaIterator(struct wormCdnaIterator **pIt)
00479 /* Free up iterator returned by wormSearchAllCdna() */
00480 {
00481 struct wormCdnaIterator *it = *pIt;
00482 if (it != NULL)
00483     {
00484     carefulClose(&it->faFile);
00485     freez(pIt);
00486     }
00487 }
00488 
00489 static boolean isAllAlpha(char *s)
00490 /* Returns TRUE if every character in string is a letter. */
00491 {
00492 char c;
00493 while ((c = *s++) != 0)
00494     {
00495     if (!isalpha(c)) return FALSE;
00496     }
00497 return TRUE;
00498 }
00499 
00500 static boolean isAllDigit(char *s)
00501 /* Returns TRUE if every character in string is a digit. */
00502 {
00503 char c;
00504 while ((c = *s++) != 0)
00505     {
00506     if (!isdigit(c)) return FALSE;
00507     }
00508 return TRUE;
00509 }
00510 
00511 boolean wormIsOrfName(char *in)
00512 /* Check to see if the input is formatted correctly to be
00513  * an ORF. */
00514 {
00515 return strchr(in, '.') != NULL;
00516 }
00517 
00518 boolean wormIsGeneName(char *name)
00519 /* See if it looks like a worm gene name - that is
00520  *   abc-12
00521  * letters followed by a dash followed by a number. */
00522 {
00523 char buf[128];
00524 int partCount;
00525 strncpy(buf, name, sizeof(buf));
00526 partCount = chopString(buf, "-", NULL, 0);
00527 if (partCount == 2)
00528     {
00529     char *parts[2];
00530     chopString(buf, "-", parts, 2);
00531     return isAllAlpha(parts[0]) && isAllDigit(parts[1]);
00532     }
00533 else
00534     {
00535     return FALSE;
00536     }
00537 }
00538 
00539 struct slName *wormGeneToOrfNames(char *name)
00540 /* Returns list of cosmid.N type ORF names that are known by abc-12 type name. */
00541 {
00542 struct slName *synList = NULL;
00543 char synFileName[512];
00544 FILE *synFile;
00545 char lineBuf[128];
00546 int nameLen = strlen(name);
00547 
00548 /* genes are supposed to be lower case. */
00549 tolowers(name);
00550 
00551 /* Open synonym file and loop through each line of it */
00552 sprintf(synFileName, "%ssyn", wormFeaturesDir());
00553 if ((synFile = fopen(synFileName, "r")) == NULL)
00554     errAbort("Can't find synonym file '%s'. (errno: %d)\n", synFileName, errno);
00555 while (fgets(lineBuf, ArraySize(lineBuf), synFile))
00556     {
00557     /* If first part of line matches chop up line. */
00558     if (strncmp(name, lineBuf, nameLen) == 0)
00559         {
00560         char *syns[32];
00561         int count;
00562         count = chopString(lineBuf, whiteSpaceChopper, syns, ArraySize(syns));
00563 
00564         /* Looks like we got a synonym.  Add all the aliases. */
00565         if (strcmp(name, syns[0]) == 0)
00566             {
00567             int i;
00568             for (i=1; i<count; ++i)
00569                 slAddTail(&synList, newSlName(syns[i]));
00570             break;
00571             }
00572         }
00573     }
00574 fclose(synFile);
00575 return synList;
00576 }
00577 
00578 char *wormGeneFirstOrfName(char *geneName)
00579 /* Return first ORF synonym to gene. */
00580 {
00581 struct slName *synList = wormGeneToOrfNames(geneName);
00582 char *name;
00583 if (synList == NULL)
00584     return NULL;
00585 name = cloneString(synList->name);
00586 slFreeList(&synList);
00587 return name;
00588 }
00589 
00590 boolean wormGeneForOrf(char *orfName, char *geneNameBuf, int bufSize)
00591 /* Look for gene type (unc-12 or something) synonym for cosmid.N name. */
00592 {
00593 FILE *f;
00594 char fileName[512];
00595 char lineBuf[512];
00596 int nameLen = strlen(orfName);
00597 boolean ok = FALSE;
00598 
00599 sprintf(fileName, "%sorf2gene", wormFeaturesDir());
00600 f = mustOpen(fileName, "r");
00601 while (fgets(lineBuf, sizeof(lineBuf), f))
00602     {
00603     if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
00604         {
00605         char *words[2];
00606         int wordCount;
00607         wordCount = chopLine(lineBuf, words);
00608         assert((int)strlen(words[1]) < bufSize);
00609         strncpy(geneNameBuf, words[1], bufSize);
00610         ok = TRUE;
00611         break;
00612         }
00613     }
00614 fclose(f);
00615 return ok;
00616 }
00617 
00618 boolean wormInfoForGene(char *orfName, struct wormCdnaInfo *retInfo)
00619 /* Return info if any on ORF, or NULL if none exists. freeMem() return value. */
00620 {
00621 FILE *f;
00622 char fileName[512];
00623 char lineBuf[512];
00624 int nameLen;
00625 char *info = NULL;
00626 char *synName = NULL;
00627 int lineCount = 0;
00628 
00629 /* Make this one work for orfs as well as gene names */
00630 if (wormIsGeneName(orfName))
00631     {
00632     synName = wormGeneFirstOrfName(orfName);
00633     if (synName != NULL)
00634         orfName = synName;
00635     }
00636 sprintf(fileName, "%sorfInfo", wormFeaturesDir());
00637 nameLen = strlen(orfName);
00638 f = mustOpen(fileName, "r");
00639 while (fgets(lineBuf, sizeof(lineBuf), f))
00640     {
00641     ++lineCount;
00642     if (strncmp(lineBuf, orfName, nameLen) == 0 && lineBuf[nameLen] == ' ')
00643         {
00644         info = cloneString(lineBuf);
00645         break;
00646         }
00647     }
00648 freeMem(synName);
00649 fclose(f);
00650 if (info == NULL)
00651     return FALSE;
00652 wormFaCommentIntoInfo(info, retInfo);
00653 return TRUE;;
00654 }
00655 
00656 boolean getWormGeneDna(char *name, DNA **retDna, boolean upcExons)
00657 /* Get the DNA associated with a gene.  Optionally upper case exons. */
00658 {
00659 struct gdfGene *g;
00660 struct slName *syn = NULL;
00661 long lstart, lend;
00662 int start, end;
00663 int dnaSize;
00664 DNA *dna;
00665 struct wormGdfCache *gdfCache;
00666 
00667 /* Translate biologist type name to cosmid.N name */
00668 if (wormIsGeneName(name))
00669     {
00670     syn = wormGeneToOrfNames(name);
00671     if (syn != NULL)
00672         name = syn->name;
00673     }
00674 if (strncmp(name, "g-", 2) == 0)
00675     gdfCache = &wormGenieGdfCache;
00676 else
00677     gdfCache = &wormSangerGdfCache;
00678 if ((g = wormGetSomeGdfGene(name, gdfCache)) == NULL)
00679     return FALSE;
00680 gdfGeneExtents(g, &lstart, &lend);
00681 start = lstart;
00682 end = lend;
00683 /* wormClipRangeToChrom(chromIds[g->chromIx], &start, &end); */
00684 dnaSize = end-start;
00685 *retDna = dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);
00686 
00687 gdfOffsetGene(g, -start);
00688 if (g->strand == '-')
00689     {
00690     reverseComplement(dna, dnaSize);
00691     gdfRcGene(g, dnaSize);
00692     }
00693 if (upcExons)
00694     {
00695     int i;
00696     struct gdfDataPoint *pt = g->dataPoints;
00697     for (i=0; i<g->dataCount; i += 2)
00698         {
00699         toUpperN(dna + pt[i].start, pt[i+1].start - pt[i].start);
00700         }
00701     }
00702 gdfFreeGene(g);
00703 return TRUE;
00704 }
00705 
00706 boolean getWormGeneExonDna(char *name, DNA **retDna)
00707 /* Get the DNA associated with a gene, without introns.  */
00708 {
00709 struct gdfGene *g;
00710 struct slName *syn = NULL;
00711 long lstart, lend;
00712 int start, end;
00713 int dnaSize;
00714 DNA *dna;
00715 int i;
00716 struct gdfDataPoint *pt = NULL;
00717 struct wormGdfCache *gdfCache;
00718 struct dyString *dy = newDyString(1000);
00719 /* Translate biologist type name to cosmid.N name */
00720 if (wormIsGeneName(name))
00721     {
00722     syn = wormGeneToOrfNames(name);
00723     if (syn != NULL)
00724         name = syn->name;
00725     }
00726 if (strncmp(name, "g-", 2) == 0)
00727     gdfCache = &wormGenieGdfCache;
00728 else
00729     gdfCache = &wormSangerGdfCache;
00730 if ((g = wormGetSomeGdfGene(name, gdfCache)) == NULL)
00731     return FALSE;
00732 gdfGeneExtents(g, &lstart, &lend);
00733 start = lstart;
00734 end = lend;
00735 /*wormClipRangeToChrom(chromIds[g->chromIx], &start, &end);*/
00736 dnaSize = end-start;
00737 dna = wormChromPart(chromIds[g->chromIx], start, dnaSize);
00738 
00739 gdfOffsetGene(g, -start);
00740 if (g->strand == '-')
00741     {
00742     reverseComplement(dna, dnaSize);
00743     gdfRcGene(g, dnaSize);
00744     }
00745 pt = g->dataPoints;
00746 for (i=0; i<g->dataCount; i += 2)
00747     {
00748     dyStringAppendN(dy, (dna+pt[i].start), (pt[i+1].start - pt[i].start));
00749     }
00750 *retDna = cloneString(dy->string);
00751 dyStringFree(&dy);
00752 gdfFreeGene(g);
00753 return TRUE;
00754 }
00755 
00756 static void makeChromFileName(char *chromId, char *buf)
00757 {
00758 getDirs();
00759 sprintf(buf, "%s%s.nt4", nt4Dir, chromId);
00760 }
00761 
00762 void wormLoadNt4Genome(struct nt4Seq ***retNt4Seq, int *retNt4Count)
00763 /* Load up entire packed worm genome into memory. */
00764 {
00765 int count = ArraySize(chromIds);
00766 struct nt4Seq **nt4s = needMem(count*sizeof(*nt4s));
00767 int i;
00768 char fileName[512];
00769 
00770 for (i=0; i<count; ++i)
00771     {
00772     makeChromFileName(chromIds[i], fileName);
00773     nt4s[i] = loadNt4(fileName, chromIds[i]);
00774     }
00775 *retNt4Seq = nt4s;
00776 *retNt4Count = count;
00777 }
00778 
00779 void wormFreeNt4Genome(struct nt4Seq ***pNt4Seq)
00780 /* Free up packed worm genome. */
00781 {
00782 struct nt4Seq **seqs;
00783 int i;
00784 if ((seqs = *pNt4Seq) == NULL)
00785     return;
00786 for (i=0; i<ArraySize(chromIds); ++i)
00787     freeNt4(&seqs[i]);
00788 freez(pNt4Seq);
00789 }
00790 
00791 int wormChromSize(char *chrom)
00792 /* Return size of worm chromosome. */
00793 {
00794 static int sizes[ArraySize(chromIds)];
00795 int ix;
00796 int size;
00797 
00798 if ((ix = wormChromIx(chrom)) < 0)
00799     errAbort("%s isn't a chromosome", chrom);
00800 size = sizes[ix];
00801 
00802 /* If we don't know it already have to get it from file. */
00803 if (size == 0)
00804     {
00805     char fileName[512];
00806     makeChromFileName(chromIds[ix], fileName);
00807     size = sizes[ix] = nt4BaseCount(fileName);
00808     }
00809 return size;
00810 }
00811 
00812 
00813 DNA *wormChromPart(char *chromId, int start, int size)
00814 /* Return part of a worm chromosome. */
00815 {
00816 char fileName[512];
00817 makeChromFileName(chromId, fileName);
00818 return nt4LoadPart(fileName, start, size);
00819 }
00820 
00821 DNA *wormChromPartExonsUpper(char *chromId, int start, int size)
00822 /* Return part of a worm chromosome with exons in upper case. */
00823 {
00824 DNA *dna = wormChromPart(chromId, start, size);
00825 struct wormFeature *geneFeat = wormGenesInRange(chromId, start, start+size);
00826 struct wormFeature *feat;
00827 
00828 for (feat = geneFeat; feat != NULL; feat = feat->next)
00829     {
00830     char *name = feat->name;
00831     if (!wormIsNamelessCluster(name))
00832         {
00833         struct gdfGene *gene = wormGetGdfGene(name);
00834         gdfUpcExons(gene, feat->start, dna, size, start);
00835         gdfFreeGene(gene);
00836         }
00837     }
00838 slFreeList(&geneFeat);
00839 return dna;
00840 }
00841 
00842 void wormClipRangeToChrom(char *chrom, int *pStart, int *pEnd)
00843 /* Make sure that we stay inside chromosome. */
00844 {
00845 int chromEnd = wormChromSize(chrom);
00846 int temp;
00847 
00848 /* Swap ends if reversed. */
00849 if (*pStart > *pEnd)
00850     {
00851     temp = *pEnd;
00852     *pEnd = *pStart;
00853     *pStart = temp;
00854     }
00855 /* Generally speaking try to slide the range covered by
00856  * start-end inside the chromosome rather than just
00857  * truncating an end. */
00858 if (*pStart < 0)
00859     {
00860     *pEnd -= *pStart;
00861     *pStart = 0;
00862     }
00863 if (*pEnd > chromEnd)
00864     {
00865     *pStart -= *pEnd - chromEnd;
00866     *pEnd = chromEnd;
00867     }
00868 /* This handles case where the range is larger than the chromosome. */
00869 if (*pStart < 0)
00870     *pStart = 0;
00871 }
00872 
00873 boolean wormParseChromRange(char *in, char **retChromId, int *retStart, int *retEnd)
00874 /* Chop up a string representation of a range within a chromosome and put the
00875  * pieces into the return variables. Return FALSE if it isn't formatted right. */
00876 {
00877 char *words[5];
00878 int wordCount;
00879 char *chromId;
00880 char buf[128];
00881 
00882 strncpy(buf, in, sizeof(buf));
00883 wordCount = chopString(buf, "- \t\r\n:", words, ArraySize(words));
00884 if (wordCount != 3)
00885     return FALSE;
00886 chromId = wormOfficialChromName(words[0]);
00887 if (chromId == NULL)
00888     return FALSE;
00889 if (!isdigit(words[1][0]) || !isdigit(words[2][0]))
00890     return FALSE;
00891 *retChromId = chromId;
00892 *retStart = atoi(words[1]);
00893 *retEnd = atoi(words[2]);
00894 wormClipRangeToChrom(chromId, retStart, retEnd);
00895 return TRUE;
00896 }
00897 
00898 boolean wormIsChromRange(char *in)
00899 /* Check to see if the input is formatted correctly to be
00900  * a range of a chromosome. */
00901 {
00902 char *chromId;
00903 int start, end;
00904 boolean ok;
00905 ok =  wormParseChromRange(in, &chromId, &start, &end);
00906 return ok;
00907 }
00908 
00909 boolean wormFixupOrfName(char *name)
00910 /* Turn something into a proper cosmid.# style name. Return FALSE if it can't be done. */
00911 {
00912 char *dot = strrchr(name, '.');
00913 if (dot == NULL)
00914     return FALSE;
00915 toUpperN(name, dot-name);   /* First part always upper case. */
00916 if (!isdigit(dot[1]))          /* Nameless cluster - just leave following digits be. */
00917     return TRUE;
00918 else
00919     tolowers(dot+1);        /* Suffix is lower case. */
00920 return TRUE;
00921 }
00922 
00923 boolean wormIsAltSplicedName(char *name)
00924 /* Is name in right form to be an isoform? */
00925 {
00926 char *dot = strrchr(name, '.');
00927 if (dot == NULL)
00928     return FALSE;
00929 if (!isdigit(dot[1]))
00930     return FALSE;
00931 return isalpha(dot[strlen(dot)-1]);
00932 }
00933 
00934 static void makeIsoformBaseName(char *name)
00935 {
00936 if (wormIsAltSplicedName(name))
00937     name[strlen(name)-1] = 0;
00938 }
00939 
00940 static boolean findAltSpliceRange(char *name, struct snof *snof, FILE *f, 
00941     char **retChrom, int *retStart, int *retEnd, char *retStrand)
00942 /* Return range of chromosome covered by a gene and all of it's isoforms. */
00943 {
00944 char baseName[64];
00945 char bName[64];
00946 int snIx, maxIx;
00947 int start = 0x7fffffff;
00948 int end = -start;
00949 char lineBuf[128];
00950 char *words[3];
00951 int wordCount;
00952 int baseNameSize;
00953 
00954 strcpy(baseName, name);
00955 makeIsoformBaseName(baseName);
00956 baseNameSize = strlen(baseName);
00957 if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
00958     return FALSE;
00959 maxIx = snofElementCount(snof);
00960 for (;snIx < maxIx; ++snIx)
00961     {
00962     long offset;
00963     char *geneName;
00964 
00965     snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
00966     if (strncmp(geneName, baseName, baseNameSize) != 0)
00967         break;
00968     strcpy(bName, geneName);
00969     makeIsoformBaseName(bName);
00970     if (sameString(baseName, bName))
00971         {
00972         int s, e;
00973         fseek(f, offset, SEEK_SET);
00974         fgets(lineBuf, sizeof(lineBuf), f);
00975         wordCount = chopLine(lineBuf, words);
00976         assert(wordCount == 3);
00977         wormParseChromRange(words[0], retChrom, &s, &e);
00978         *retStrand = words[1][0];
00979         if (start > s)
00980             start = s;
00981         if (end < e)
00982             end = e;
00983         }
00984     }
00985 *retStart = start;
00986 *retEnd = end;
00987 return TRUE;
00988 }
00989 
00990 
00991 boolean wormGeneRange(char *name, char **retChrom, char *retStrand, int *retStart, int *retEnd)
00992 /* Return chromosome position of a chrom range, gene, ORF, cosmid, or nameless cluster. */
00993 {
00994 static struct snof *c2gSnof = NULL, *c2cSnof = NULL;
00995 static FILE *c2gFile = NULL, *c2cFile = NULL;
00996 long offset;
00997 char fileName[512];
00998 struct slName *syn = NULL;
00999 boolean ok;
01000 
01001 if (wormIsChromRange(name))
01002     {
01003     *retStrand = '.';
01004     return wormParseChromRange(name, retChrom, retStart, retEnd);
01005     }
01006 
01007 getDirs();
01008 
01009 /* Translate biologist type name to cosmid.N name */
01010 if (wormIsGeneName(name))
01011     {
01012     syn = wormGeneToOrfNames(name);
01013     if (syn != NULL)
01014         {
01015         name = syn->name;
01016         }
01017     }
01018 if (wormFixupOrfName(name)) /* See if ORF, and if so make nice. */
01019     {
01020     if (c2gSnof == NULL)
01021         {
01022         sprintf(fileName, "%sc2g", featDir);
01023         c2gSnof = snofMustOpen(fileName);
01024         sprintf(fileName, "%sc2g", featDir);
01025         c2gFile = mustOpen(fileName, "rb");
01026         }
01027     ok = findAltSpliceRange(name, c2gSnof, c2gFile, retChrom, retStart, retEnd, retStrand);
01028     }
01029 else    /* Lets say it's a cosmid. */
01030     {
01031     char lineBuf[128];
01032     char *words[3];
01033     int wordCount;
01034     touppers(name);
01035     if (c2cSnof == NULL)
01036         {
01037         sprintf(fileName, "%sc2c", featDir);
01038         c2cSnof = snofMustOpen(fileName);
01039         sprintf(fileName, "%sc2c", featDir);
01040         c2cFile = mustOpen(fileName, "rb");
01041         }
01042     if (!snofFindOffset(c2cSnof, name, &offset) )
01043         return FALSE;
01044     fseek(c2cFile, offset, SEEK_SET);
01045     fgets(lineBuf, sizeof(lineBuf), c2cFile);
01046     wordCount = chopLine(lineBuf, words);
01047     assert(wordCount == 3);
01048     assert(strcmp(words[2], name) == 0);
01049     assert(wormIsChromRange(words[0]));
01050     *retStrand = words[1][0];
01051     ok = wormParseChromRange(words[0], retChrom, retStart, retEnd);
01052     }
01053 slFreeList(&syn);
01054 return ok;
01055 }
01056 
01057 boolean wormIsNamelessCluster(char *name)
01058 /* Returns true if name is of correct format to be a nameless cluster. */
01059 {
01060 char *e = strrchr(name, '.');
01061 if (e == NULL)
01062     return FALSE;
01063 if (e[1] != 'N')
01064     return FALSE;
01065 if (!isdigit(e[2]))
01066     return FALSE;
01067 return TRUE;
01068 }
01069 
01070 DNA *wormGetNamelessClusterDna(char *name)
01071 /* Get DNA associated with nameless cluster */
01072 {
01073 char *chrom;
01074 int start, end;
01075 char strand;
01076 if (!wormGeneRange(name, &chrom, &strand, &start, &end))
01077     errAbort("Can't find %s in database", name);
01078 return wormChromPart(chrom, start, end-start);
01079 }
01080 
01081 struct wormGdfCache wormSangerGdfCache = {&sangerDir,NULL,NULL};
01082 struct wormGdfCache wormGenieGdfCache = {&genieDir,NULL,NULL};
01083 struct wormGdfCache *defaultGdfCache = &wormSangerGdfCache;
01084 
01085 
01086 static void wormCacheSomeGdf(struct wormGdfCache *cache)
01087 /* Cache one gene prediction set. */
01088 {
01089 if (cache->snof == NULL)
01090     {
01091     char fileName[512];
01092     char *dir;
01093     bits32 sig;
01094     getDirs();
01095     dir = *(cache->pDir);
01096     sprintf(fileName, "%sgenes", dir);
01097     cache->snof = snofMustOpen(fileName);
01098     sprintf(fileName, "%sgenes.gdf", dir);
01099     cache->file = mustOpen(fileName, "rb");
01100     mustReadOne(cache->file, sig);
01101     if (sig != glSig)
01102         errAbort("%s is not a good file", fileName);
01103     }
01104 }
01105 
01106 #if 0 /* unused */
01107 static void wormCacheGdf()
01108 /* Set up for fast access to GDF file entries. */
01109 {
01110 wormCacheSomeGdf(defaultGdfCache);
01111 }
01112 #endif
01113 
01114 void wormUncacheSomeGdf(struct wormGdfCache *cache)
01115 /* Uncache some gene prediction set. */
01116 {
01117 snofClose(&cache->snof);
01118 carefulClose(&cache->file);
01119 }
01120 
01121 void wormUncacheGdf()
01122 /* Free up resources associated with fast GDF access. */
01123 {
01124 wormUncacheSomeGdf(defaultGdfCache);
01125 }
01126 
01127 struct gdfGene *wormGetSomeGdfGene(char *name, struct wormGdfCache *cache)
01128 /* Get a single gdfGene of given name. */
01129 {
01130 long offset;
01131 
01132 wormCacheSomeGdf(cache);
01133 if (!snofFindOffset(cache->snof, name, &offset) )
01134     return NULL;
01135 fseek(cache->file, offset, SEEK_SET);
01136 return gdfReadOneGene(cache->file);
01137 }
01138 
01139 struct gdfGene *wormGetGdfGene(char *name)
01140 /* Get a single gdfGene of given name. */
01141 {
01142 return wormGetSomeGdfGene(name, defaultGdfCache);
01143 }
01144 
01145 struct gdfGene *wormGetSomeGdfGeneList(char *baseName, int baseNameSize, struct wormGdfCache *cache)
01146 /* Get all gdfGenes that start with a given name. */
01147 {
01148 int snIx;
01149 int maxIx;
01150 struct snof *snof;
01151 FILE *f;
01152 struct gdfGene *list = NULL, *el;
01153 
01154 wormCacheSomeGdf(cache);
01155 snof = cache->snof;
01156 f = cache->file;
01157 if (!snofFindFirstStartingWith(snof, baseName, baseNameSize, &snIx))
01158     return NULL;
01159 
01160 maxIx = snofElementCount(snof);
01161 for (;snIx < maxIx; ++snIx)
01162     {
01163     long offset;
01164     char *geneName;
01165 
01166     snofNameOffsetAtIx(snof, snIx, &geneName, &offset);
01167     if (strncmp(geneName, baseName, baseNameSize) != 0)
01168         break;
01169     fseek(f, offset, SEEK_SET);
01170     el = gdfReadOneGene(f);
01171     slAddTail(&list, el);
01172     }
01173 slReverse(&list);
01174 return list;
01175 }
01176 
01177 struct gdfGene *wormGetGdfGeneList(char *baseName, int baseNameSize)
01178 /* Get all gdfGenes that start with a given name. */
01179 {
01180 return wormGetSomeGdfGeneList(baseName, baseNameSize, defaultGdfCache);
01181 }
01182 
01183 struct gdfGene *wormGdfGenesInRange(char *chrom, int start, int end, 
01184     struct wormGdfCache *geneFinder)
01185 /* Get list of genes in range according to given gene finder. */
01186 {
01187 char *dir = NULL;
01188 struct gdfGene *gdfList = NULL, *gdf;
01189 struct wormFeature *nameList, *name;
01190 
01191 if (geneFinder == &wormSangerGdfCache)
01192     dir = wormSangerDir();
01193 else if (geneFinder == &wormGenieGdfCache)
01194     dir = wormGenieDir();
01195 else
01196     errAbort("Unknown geneFinder line %d of %s", __LINE__, __FILE__);
01197 
01198 nameList = wormSomeGenesInRange(chrom, start, end, dir);
01199 for (name = nameList; name != NULL; name = name->next)
01200     {
01201     char *n = name->name;
01202     if (!wormIsNamelessCluster(n))
01203         {
01204         gdf = wormGetSomeGdfGene(n, geneFinder);
01205         slAddHead(&gdfList, gdf);
01206         }
01207     }
01208 slFreeList(&nameList);
01209 slReverse(&gdfList);
01210 return gdfList;
01211 }
01212 
01213 

Generated on Tue Dec 25 18:39:32 2007 for blat by  doxygen 1.5.2