00001
00002
00003
00004
00005 #include "common.h"
00006 #include "hash.h"
00007 #include "linefile.h"
00008 #include "obscure.h"
00009 #include "dnaseq.h"
00010 #include "fa.h"
00011 #include "nib.h"
00012 #include "twoBit.h"
00013 #include "repMask.h"
00014 #include "gfClientLib.h"
00015
00016 void gfClientFileArray(char *fileName, char ***retFiles, int *retFileCount)
00017
00018
00019
00020 {
00021 boolean gotSingle = FALSE;
00022 char *buf;
00023
00024 if (nibIsFile(fileName) || twoBitIsSpec(fileName)
00025 || sameString(fileName, "stdin"))
00026 gotSingle = TRUE;
00027
00028
00029 else
00030 {
00031 FILE *f = mustOpen(fileName, "r");
00032 char c = fgetc(f);
00033 fclose(f);
00034 if (c == '>')
00035 gotSingle = TRUE;
00036 }
00037 if (gotSingle)
00038 {
00039 char **files;
00040 *retFiles = AllocArray(files, 1);
00041 files[0] = cloneString(fileName);
00042 *retFileCount = 1;
00043 return;
00044 }
00045 else
00046 {
00047 readAllWords(fileName, retFiles, retFileCount, &buf);
00048 }
00049 }
00050
00051
00052 void gfClientUnmask(struct dnaSeq *seqList)
00053
00054 {
00055 struct dnaSeq *seq;
00056 for (seq = seqList; seq != NULL; seq = seq->next)
00057 faToDna(seq->dna, seq->size);
00058 }
00059
00060 static void maskFromOut(struct dnaSeq *seqList, char *outFile, float minRepDivergence)
00061
00062
00063 {
00064 struct lineFile *lf = lineFileOpen(outFile, TRUE);
00065 struct hash *hash = newHash(0);
00066 struct dnaSeq *seq;
00067 char *line;
00068
00069 for (seq = seqList; seq != NULL; seq = seq->next)
00070 hashAdd(hash, seq->name, seq);
00071 if (!lineFileNext(lf, &line, NULL))
00072 errAbort("Empty mask file %s\n", lf->fileName);
00073 if (!startsWith("There were no", line))
00074 {
00075 if (!startsWith(" SW", line))
00076 errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
00077 if (!lineFileNext(lf, &line, NULL) || !startsWith("score", line))
00078 errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
00079 lineFileNext(lf, &line, NULL);
00080 while (lineFileNext(lf, &line, NULL))
00081 {
00082 char *words[32];
00083 struct repeatMaskOut rmo;
00084 int wordCount;
00085 int seqSize;
00086 int repSize;
00087 wordCount = chopLine(line, words);
00088 if (wordCount < 14)
00089 errAbort("%s line %d - error in repeat mask .out file\n", lf->fileName, lf->lineIx);
00090 repeatMaskOutStaticLoad(words, &rmo);
00091
00092 if (rmo.percDiv + rmo.percDel + rmo.percInc <= minRepDivergence)
00093 {
00094 if((seq = hashFindVal(hash, rmo.qName)) == NULL)
00095 errAbort("%s is in %s but not corresponding sequence file, files out of sync?\n",
00096 rmo.qName, lf->fileName);
00097 seqSize = seq->size;
00098 if (rmo.qStart <= 0 || rmo.qStart > seqSize || rmo.qEnd <= 0
00099 || rmo.qEnd > seqSize || rmo.qStart > rmo.qEnd)
00100 {
00101 warn("Repeat mask sequence out of range (%d-%d of %d in %s)\n",
00102 rmo.qStart, rmo.qEnd, seqSize, rmo.qName);
00103 if (rmo.qStart <= 0)
00104 rmo.qStart = 1;
00105 if (rmo.qEnd > seqSize)
00106 rmo.qEnd = seqSize;
00107 }
00108 repSize = rmo.qEnd - rmo.qStart + 1;
00109 if (repSize > 0)
00110 toUpperN(seq->dna + rmo.qStart - 1, repSize);
00111 }
00112 }
00113 }
00114 freeHash(&hash);
00115 lineFileClose(&lf);
00116 }
00117
00118 static void maskNucSeqList(struct dnaSeq *seqList, char *seqFileName, char *maskType,
00119 boolean hardMask, float minRepDivergence)
00120
00121
00122 {
00123 struct dnaSeq *seq;
00124 char *outFile = NULL, outNameBuf[512];
00125
00126 if (sameWord(maskType, "upper"))
00127 {
00128
00129 }
00130 else if (sameWord(maskType, "lower"))
00131 {
00132 for (seq = seqList; seq != NULL; seq = seq->next)
00133 toggleCase(seq->dna, seq->size);
00134 }
00135 else
00136 {
00137
00138 if (sameWord(maskType, "out"))
00139 {
00140 sprintf(outNameBuf, "%s.out", seqFileName);
00141 outFile = outNameBuf;
00142 }
00143 else
00144 {
00145 outFile = maskType;
00146 }
00147 gfClientUnmask(seqList);
00148 maskFromOut(seqList, outFile, minRepDivergence);
00149 }
00150 if (hardMask)
00151 {
00152 for (seq = seqList; seq != NULL; seq = seq->next)
00153 upperToN(seq->dna, seq->size);
00154 }
00155 }
00156
00157 bioSeq *gfClientSeqList(int fileCount, char *files[],
00158 boolean isProt, boolean isTransDna, char *maskType,
00159 float minRepDivergence, boolean showStatus)
00160
00161
00162
00163
00164
00165
00166
00167 {
00168 int i;
00169 char *fileName;
00170 bioSeq *seqList = NULL, *seq;
00171 boolean doMask = (maskType != NULL);
00172
00173 for (i=0; i<fileCount; ++i)
00174 {
00175 struct dnaSeq *list = NULL, sseq;
00176 ZeroVar(&sseq);
00177 fileName = files[i];
00178 if (nibIsFile(fileName))
00179 list = nibLoadAllMasked(NIB_MASK_MIXED|NIB_BASE_NAME, fileName);
00180 else if (twoBitIsSpec(fileName))
00181 list = twoBitLoadAll(fileName);
00182 else if (isProt)
00183 list = faReadAllPep(fileName);
00184 else
00185 list = faReadAllMixed(fileName);
00186
00187
00188 if (doMask)
00189 {
00190 maskNucSeqList(list, fileName, maskType, isTransDna, minRepDivergence);
00191 }
00192 else
00193 {
00194
00195 for (seq = list; seq != NULL; seq = seq->next)
00196 {
00197 if (isProt)
00198 faToProtein(seq->dna, seq->size);
00199 else
00200 faToDna(seq->dna, seq->size);
00201 }
00202 }
00203
00204 seqList = slCat(seqList, list);
00205 }
00206 if (showStatus)
00207 {
00208
00209 int count = 0;
00210 unsigned long totalSize = 0;
00211 for (seq = seqList; seq != NULL; seq = seq->next)
00212 {
00213 totalSize += seq->size;
00214 count += 1;
00215 }
00216 printf("Loaded %lu letters in %d sequences\n", totalSize, count);
00217 }
00218 return seqList;
00219 }
00220