jkOwnLib/gfClientLib.c File Reference

#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "obscure.h"
#include "dnaseq.h"
#include "fa.h"
#include "nib.h"
#include "twoBit.h"
#include "repMask.h"
#include "gfClientLib.h"

Include dependency graph for gfClientLib.c:

Go to the source code of this file.

Functions

void gfClientFileArray (char *fileName, char ***retFiles, int *retFileCount)
void gfClientUnmask (struct dnaSeq *seqList)
static void maskFromOut (struct dnaSeq *seqList, char *outFile, float minRepDivergence)
static void maskNucSeqList (struct dnaSeq *seqList, char *seqFileName, char *maskType, boolean hardMask, float minRepDivergence)
bioSeq * gfClientSeqList (int fileCount, char *files[], boolean isProt, boolean isTransDna, char *maskType, float minRepDivergence, boolean showStatus)


Function Documentation

void gfClientFileArray ( char *  fileName,
char ***  retFiles,
int *  retFileCount 
)

Definition at line 16 of file gfClientLib.c.

References AllocArray, cloneString(), FALSE, mustOpen(), nibIsFile(), readAllWords(), sameString, TRUE, and twoBitIsSpec().

Referenced by blat().

00020 { /* Fill *retFiles / *retFileCount with the list of sequence files named by fileName. */
00021 boolean gotSingle = FALSE; /* TRUE when fileName itself is the one sequence source. */
00022 char *buf;              /* Handed to readAllWords() and never freed here: this will leak memory but won't matter. */
00023 
00024 if (nibIsFile(fileName) || twoBitIsSpec(fileName)
00025     || sameString(fileName, "stdin"))
00026     gotSingle = TRUE;
00027 /* Detect .fa files (where suffix is not standardized)
00028  * by first character being a '>'. */
00029 else
00030     {
00031     FILE *f = mustOpen(fileName, "r");
00032     char c = fgetc(f); /* Only the first character is read. NOTE(review): fgetc() returns int; the value is narrowed to char and only compared with '>'. */
00033     fclose(f);
00034     if (c == '>')
00035         gotSingle = TRUE;
00036     }
00037 if (gotSingle)
00038     {
00039     /* Return a one-element array holding a copy of fileName. */
00040     char **files;
00041     *retFiles = AllocArray(files, 1);
00042     files[0] = cloneString(fileName);
00043     *retFileCount = 1;
00044     return;
00045     }
00046 else
00047     {
00048     /* Otherwise let readAllWords() fill the outputs from fileName (presumably a file listing one sequence file per word - confirm against readAllWords). */
00049     readAllWords(fileName, retFiles, retFileCount, &buf);
00050     }
00051 }

Here is the call graph for this function:

Here is the caller graph for this function:

bioSeq* gfClientSeqList ( int  fileCount,
char *  files[],
boolean  isProt,
boolean  isTransDna,
char *  maskType,
float  minRepDivergence,
boolean  showStatus 
)

Definition at line 157 of file gfClientLib.c.

References doMask, faReadAllMixed(), faReadAllPep(), faToDna(), faToProtein(), maskNucSeqList(), dnaSeq::next, NIB_BASE_NAME, NIB_MASK_MIXED, nibIsFile(), nibLoadAllMasked(), slCat(), twoBitIsSpec(), twoBitLoadAll(), and ZeroVar.

Referenced by blat().

00167 { /* Load the sequences of files[0..fileCount-1] into one list, mask or case-normalize them per file, and return the combined list. */
00168 int i;
00169 char *fileName;
00170 bioSeq *seqList = NULL, *seq;
00171 boolean doMask = (maskType != NULL); /* Masking is requested exactly when maskType is non-NULL. */
00172 
00173 for (i=0; i<fileCount; ++i)
00174     {
00175     struct dnaSeq *list = NULL, sseq; /* NOTE(review): sseq is zeroed on the next line; no other use is visible in this listing. */
00176     ZeroVar(&sseq);
00177     fileName = files[i];
00178     /* Pick the loader: nib file, twoBit spec, protein fasta (isProt), else mixed-case fasta. */
00179     if (nibIsFile(fileName))
00180         list = nibLoadAllMasked(NIB_MASK_MIXED|NIB_BASE_NAME, fileName);
00181     else if (twoBitIsSpec(fileName))
00182         list = twoBitLoadAll(fileName);
00183     else if (isProt)
00184       list = faReadAllPep(fileName);
00185     else
00186       list = faReadAllMixed(fileName);
00187 
00188     /* If necessary mask sequence from file. */
00189     if (doMask)
00190         {
00191         maskNucSeqList(list, fileName, maskType, isTransDna, minRepDivergence); /* isTransDna is passed as the hardMask argument. */
00192         }
00193     else
00194         {
00195         /* If not masking send everything to proper case here. */
00196         for (seq = list; seq != NULL; seq = seq->next)
00197             {
00198             if (isProt)
00199                 faToProtein(seq->dna, seq->size);
00200             else
00201                 faToDna(seq->dna, seq->size);
00202             }
00203         }
00204     /* Move local list to end of bigger list. */
00205     seqList = slCat(seqList, list);
00206     }
00207 if (showStatus)
00208     {
00209     /* Total up size and sequence count and report. */
00210     int count = 0; 
00211     unsigned long totalSize = 0;
00212     for (seq = seqList; seq != NULL; seq = seq->next)
00213         {
00214         totalSize += seq->size;
00215         count += 1;
00216         }
00217     printf("Loaded %lu letters in %d sequences\n", totalSize, count);
00218     }
00219 return seqList;
00220 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfClientUnmask ( struct dnaSeq *  seqList  ) 

Definition at line 52 of file gfClientLib.c.

References dnaSeq::dna, faToDna(), dnaSeq::next, and dnaSeq::size.

Referenced by blat(), and maskNucSeqList().

00054 { /* Apply faToDna() to the dna of every sequence in seqList; an empty (NULL) list is a no-op. */
00055 struct dnaSeq *seq;
00056 for (seq = seqList; seq != NULL; seq = seq->next)
00057     faToDna(seq->dna, seq->size);
00058 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void maskFromOut ( struct dnaSeq *  seqList,
char *  outFile,
float  minRepDivergence 
) [static]

Definition at line 60 of file gfClientLib.c.

References chopLine, dnaSeq::dna, errAbort(), lineFile::fileName, freeHash(), hashAdd(), hashFindVal(), lineFileClose(), lineFileNext(), lineFileOpen(), dnaSeq::name, newHash(), dnaSeq::next, repeatMaskOut::percDel, repeatMaskOut::percDiv, repeatMaskOut::percInc, repeatMaskOut::qEnd, repeatMaskOut::qName, repeatMaskOut::qStart, repeatMaskOutStaticLoad(), dnaSeq::size, startsWith(), toUpperN(), TRUE, and warn().

Referenced by maskNucSeqList().

00063 { /* Upper-case the regions of seqList's sequences that the RepeatMasker .out file outFile lists with divergence <= minRepDivergence. */
00064 struct lineFile *lf = lineFileOpen(outFile, TRUE);
00065 struct hash *hash = newHash(0);
00066 struct dnaSeq *seq;
00067 char *line;
00068 
00069 /* Index the sequences by name so each .out record can find its sequence. */
00070 for (seq = seqList; seq != NULL; seq = seq->next)
00071     hashAdd(hash, seq->name, seq);
00072 if (!lineFileNext(lf, &line, NULL))
00073     errAbort("Empty mask file %s\n", lf->fileName);
00074 if (!startsWith("There were no", line)) /* No repeats is ok. Not much work. */
00075     {
00076     /* Expect the two header lines (starting "   SW" and "score"), then skip one more line. */
00077     if (!startsWith("   SW", line))
00078         errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
00079     if (!lineFileNext(lf, &line, NULL) || !startsWith("score", line))
00080         errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
00081     lineFileNext(lf, &line, NULL);  /* Blank line. */
00082     while (lineFileNext(lf, &line, NULL))
00083         {
00084         char *words[32];
00085         struct repeatMaskOut rmo;
00086         int wordCount;
00087         int seqSize;
00088         int repSize;
00089         wordCount = chopLine(line, words);
00090         if (wordCount < 14)
00091             errAbort("%s line %d - error in repeat mask .out file\n", lf->fileName, lf->lineIx);
00092         repeatMaskOutStaticLoad(words, &rmo);
00093         /* If repeat is more than minRepDivergence divergent (percDiv + percDel + percInc) don't worry about it. */
00094         if (rmo.percDiv + rmo.percDel + rmo.percInc <= minRepDivergence)
00095             {
00096             if((seq = hashFindVal(hash, rmo.qName)) == NULL)
00097                 errAbort("%s is in %s but not corresponding sequence file, files out of sync?\n", 
00098                         rmo.qName, lf->fileName);
00099             seqSize = seq->size;
00100             /* qStart/qEnd are used as 1-based inclusive coordinates (see the "- 1" and "+ 1" below). */
00101             if (rmo.qStart <= 0 || rmo.qStart > seqSize || rmo.qEnd <= 0 
00102                 || rmo.qEnd > seqSize || rmo.qStart > rmo.qEnd)
00103                 {
00104                 warn("Repeat mask sequence out of range (%d-%d of %d in %s)\n",
00105                     rmo.qStart, rmo.qEnd, seqSize, rmo.qName);
00106                 /* Only a low qStart and a high qEnd are clamped; the other bad cases give repSize <= 0 and are skipped below. */
00107                 if (rmo.qStart <= 0)
00108                     rmo.qStart = 1;
00109                 if (rmo.qEnd > seqSize)
00110                     rmo.qEnd = seqSize;
00111                 }
00112             repSize = rmo.qEnd - rmo.qStart + 1;
00113             if (repSize > 0)
00114                 toUpperN(seq->dna + rmo.qStart - 1, repSize);
00115             }
00116         }
00117     }
00118 freeHash(&hash);
00119 lineFileClose(&lf);
00120 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void maskNucSeqList ( struct dnaSeq *  seqList,
char *  seqFileName,
char *  maskType,
boolean  hardMask,
float  minRepDivergence 
) [static]

Definition at line 118 of file gfClientLib.c.

References dnaSeq::dna, gfClientUnmask(), maskFromOut(), dnaSeq::next, sameWord, dnaSeq::size, toggleCase(), and upperToN().

Referenced by gfClientSeqList().

00122 { /* Apply the masking selected by maskType ("upper", "lower", "out", or a .out file name) to seqList; if hardMask, then turn upper-case letters into N via upperToN(). */
00123 struct dnaSeq *seq;
00124 char *outFile = NULL, outNameBuf[512];
00125 
00126 if (sameWord(maskType, "upper"))
00127     {
00128     /* Already has dna to be masked in upper case. */
00129     }
00130 else if (sameWord(maskType, "lower"))
00131     {
00132     /* Flip case on every sequence, so what was lower case ends up upper case. */
00133     for (seq = seqList; seq != NULL; seq = seq->next)
00134         toggleCase(seq->dna, seq->size);
00135     }
00136 else
00137     {
00138     /* Masking from a RepeatMasker .out file. */
00139     if (sameWord(maskType, "out"))
00140         {
00141         sprintf(outNameBuf, "%s.out", seqFileName); /* NOTE(review): unbounded sprintf into a 512-byte buffer; a long seqFileName would overflow it. */
00142         outFile = outNameBuf;
00143         }
00144     else
00145         {
00146         outFile = maskType; /* Any other maskType value is used as the .out file name itself. */
00147         }
00148     gfClientUnmask(seqList); /* Clear existing case-based masking before applying the .out file. */
00149     maskFromOut(seqList, outFile, minRepDivergence);
00150     }
00151 if (hardMask)
00152     {
00153     for (seq = seqList; seq != NULL; seq = seq->next)
00154         upperToN(seq->dna, seq->size);
00155     }
00156 }

Here is the call graph for this function:

Here is the caller graph for this function:


Generated on Tue Dec 25 19:27:36 2007 for blat by  doxygen 1.5.2