00001
00002
00003
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "options.h"
00008 #include "dystring.h"
00009 #include "fa.h"
00010 #include "net.h"
00011 #include "genoFind.h"
00012 #include "sqlNum.h"
00013 #include "gfInternal.h"
00014 #include "gfPcrLib.h"
00015
00016 static char const rcsid[] = "$Id: gfPcrLib.c,v 1.8 2006/03/15 18:36:16 angie Exp $";
00017
00018
00019
00020 void gfPcrOutputFree(struct gfPcrOutput **pOut)
00021
00022 {
00023 struct gfPcrOutput *out = *pOut;
00024 if (pOut != NULL)
00025 {
00026 freeMem(out->name);
00027 freeMem(out->fPrimer);
00028 freeMem(out->rPrimer);
00029 freeMem(out->seqName);
00030 freeMem(out->dna);
00031 freez(pOut);
00032 }
00033 }
00034
00035 void gfPcrOutputFreeList(struct gfPcrOutput **pList)
00036
00037 {
00038 struct gfPcrOutput *el, *next;
00039
00040 for (el = *pList; el != NULL; el = next)
00041 {
00042 next = el->next;
00043 gfPcrOutputFree(&el);
00044 }
00045 *pList = NULL;
00046 }
00047
00048 void gfPcrInputStaticLoad(char **row, struct gfPcrInput *ret)
00049
00050
00051 {
00052 ret->name = row[0];
00053 ret->fPrimer = row[1];
00054 ret->rPrimer = row[2];
00055 }
00056
00057 struct gfPcrInput *gfPcrInputLoad(char **row)
00058
00059
00060 {
00061 struct gfPcrInput *ret;
00062
00063 AllocVar(ret);
00064 ret->name = cloneString(row[0]);
00065 ret->fPrimer = cloneString(row[1]);
00066 ret->rPrimer = cloneString(row[2]);
00067 return ret;
00068 }
00069
00070 struct gfPcrInput *gfPcrInputLoadAll(char *fileName)
00071
00072
00073 {
00074 struct gfPcrInput *list = NULL, *el;
00075 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00076 char *row[3];
00077
00078 while (lineFileRow(lf, row))
00079 {
00080 el = gfPcrInputLoad(row);
00081 slAddHead(&list, el);
00082 }
00083 lineFileClose(&lf);
00084 slReverse(&list);
00085 return list;
00086 }
00087
00088
00089 void gfPcrInputFree(struct gfPcrInput **pEl)
00090
00091
00092 {
00093 struct gfPcrInput *el;
00094
00095 if ((el = *pEl) == NULL) return;
00096 freeMem(el->name);
00097 freeMem(el->fPrimer);
00098 freeMem(el->rPrimer);
00099 freez(pEl);
00100 }
00101
00102 void gfPcrInputFreeList(struct gfPcrInput **pList)
00103
00104 {
00105 struct gfPcrInput *el, *next;
00106
00107 for (el = *pList; el != NULL; el = next)
00108 {
00109 next = el->next;
00110 gfPcrInputFree(&el);
00111 }
00112 *pList = NULL;
00113 }
00114
00115
00116
00117
00118
00119 static boolean goodMatch(char *a, char *b, int size)
00120
00121 {
00122 int score = 0, i;
00123 for (i=0; i<size; ++i)
00124 {
00125 if (a[i] == b[i])
00126 score += 1;
00127 else
00128 score -= 2;
00129 }
00130 return score >= 0;
00131 }
00132
00133 static void upperMatch(char *dna, char *primer, int size)
00134
00135 {
00136 int i;
00137 for (i=0; i<size; ++i)
00138 {
00139 if (dna[i] == primer[i])
00140 dna[i] = toupper(dna[i]);
00141 }
00142 }
00143
00144 static void outputFa(struct gfPcrOutput *out, FILE *f, char *url)
00145
00146 {
00147 int fPrimerSize = strlen(out->fPrimer);
00148 int rPrimerSize = strlen(out->rPrimer);
00149 int productSize = out->rPos - out->fPos;
00150 char *dna = cloneStringZ(out->dna, productSize);
00151 char *rrPrimer = cloneString(out->rPrimer);
00152 char *ffPrimer = cloneString(out->fPrimer);
00153 struct dyString *faLabel = newDyString(0);
00154 char *name = out->name;
00155
00156
00157 touppers(rrPrimer);
00158 touppers(ffPrimer);
00159 if (url != NULL)
00160 {
00161 dyStringAppend(faLabel, "<A HREF=\"");
00162 dyStringPrintf(faLabel, url, out->seqName, out->fPos+1, out->rPos);
00163 dyStringAppend(faLabel, "\">");
00164 }
00165 dyStringPrintf(faLabel, "%s:%d%c%d",
00166 out->seqName, out->fPos+1, out->strand, out->rPos);
00167 if (url != NULL)
00168 dyStringAppend(faLabel, "</A>");
00169 if (name != NULL)
00170 dyStringPrintf(faLabel, " %s", name);
00171 dyStringPrintf(faLabel, " %dbp %s %s",
00172 out->rPos - out->fPos, ffPrimer, rrPrimer);
00173
00174
00175 reverseComplement(rrPrimer, rPrimerSize);
00176 tolowers(rrPrimer);
00177
00178
00179 upperMatch(dna, out->fPrimer, fPrimerSize);
00180 upperMatch(dna + productSize - rPrimerSize, rrPrimer, rPrimerSize);
00181 faWriteNext(f, faLabel->string, dna, productSize);
00182
00183
00184 freez(&dna);
00185 freez(&rrPrimer);
00186 freez(&ffPrimer);
00187 dyStringFree(&faLabel)
00188 }
00189
00190 static int countMatch(char *a, char *b, int size)
00191
00192 {
00193 int count = 0, i;
00194 for (i=0; i<size; ++i)
00195 if (a[i] == b[i])
00196 ++count;
00197 return count;
00198 }
00199
00200 static void outputBed(struct gfPcrOutput *out, FILE *f, char *url)
00201
00202 {
00203 int match;
00204 int size = out->rPos - out->fPos;
00205 int fPrimerSize = strlen(out->fPrimer);
00206 int rPrimerSize = strlen(out->rPrimer);
00207 char *name = out->name;
00208 if (name == NULL) name = "n/a";
00209 fprintf(f, "%s\t%d\t%d\t", out->seqName, out->fPos, out->rPos);
00210 fprintf(f, "%s\t", name);
00211 match = countMatch(out->dna, out->fPrimer, fPrimerSize);
00212 assert(size > 0);
00213 reverseComplement(out->rPrimer, rPrimerSize);
00214 match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
00215 reverseComplement(out->rPrimer, rPrimerSize);
00216 fprintf(f, "%d\t", round(1000.0 * match / (double)(fPrimerSize + rPrimerSize) ));
00217 fprintf(f, "%c\n", out->strand);
00218 }
00219
00220 static void outputPsl(struct gfPcrOutput *out, FILE *f, char *url)
00221
00222 {
00223 int match;
00224 int size = out->rPos - out->fPos;
00225 int fPrimerSize = strlen(out->fPrimer);
00226 int rPrimerSize = strlen(out->rPrimer);
00227 int bothSize = fPrimerSize + rPrimerSize;
00228 int gapSize = size - bothSize;
00229 char *name = out->name;
00230 if (name == NULL) name = "n/a";
00231 match = countMatch(out->dna, out->fPrimer, fPrimerSize);
00232 reverseComplement(out->rPrimer, rPrimerSize);
00233 assert(size > 0);
00234 match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
00235 reverseComplement(out->rPrimer, rPrimerSize);
00236
00237 fprintf(f, "%d\t", match);
00238 fprintf(f, "%d\t", bothSize - match);
00239 fprintf(f, "0\t0\t");
00240 fprintf(f, "1\t%d\t", gapSize);
00241 fprintf(f, "1\t%d\t", gapSize);
00242 fprintf(f, "%c\t", out->strand);
00243 fprintf(f, "%s\t", name);
00244 fprintf(f, "%d\t", size);
00245 fprintf(f, "0\t%d\t", size);
00246 fprintf(f, "%s\t%d\t", out->seqName, out->seqSize);
00247 fprintf(f, "%d\t%d\t", out->fPos, out->rPos);
00248 fprintf(f, "2\t");
00249 fprintf(f, "%d,%d,\t", fPrimerSize, rPrimerSize);
00250 fprintf(f, "%d,%d,\t", 0,size - rPrimerSize);
00251 fprintf(f, "%d,%d,\n", out->fPos, out->rPos - rPrimerSize);
00252 }
00253
00254
00255 void gfPcrOutputWriteList(struct gfPcrOutput *outList, char *outType,
00256 char *url, FILE *f)
00257
00258
00259
00260 {
00261 struct gfPcrOutput *out;
00262 void (*output)(struct gfPcrOutput *out, FILE *f, char *url) = NULL;
00263 if (sameWord(outType, "fa"))
00264 output = outputFa;
00265 else if (sameWord(outType, "bed"))
00266 output = outputBed;
00267 else if (sameWord(outType, "psl"))
00268 output = outputPsl;
00269 else
00270 errAbort("Unrecognized pcr output type %s", outType);
00271 for (out = outList; out != NULL; out = out->next)
00272 output(out, f, url);
00273 }
00274
00275 void gfPcrOutputWriteAll(struct gfPcrOutput *outList,
00276 char *outType, char *url, char *fileName)
00277
00278
00279
00280 {
00281 FILE *f = mustOpen(fileName, "w");
00282 gfPcrOutputWriteList(outList, outType, url, f);
00283 carefulClose(&f);
00284 }
00285
00286
00287 static void pcrLocalStrand(char *pcrName,
00288 struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize,
00289 int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
00290 int minPerfect, int minGood,
00291 char strand, struct gfPcrOutput **pOutList)
00292
00293 {
00294 char *fDna = seq->dna, *rDna;
00295 char *endDna = fDna + seq->size;
00296 char *fpPerfect = fPrimer + fPrimerSize - minPerfect;
00297 char *rpPerfect = rPrimer;
00298 char *fpMatch, *rpMatch;
00299 int sizeLeft;
00300 int goodSize = minGood - minPerfect;
00301 struct gfPcrOutput *out;
00302 int matchSize;
00303
00304 reverseComplement(rPrimer, rPrimerSize);
00305 for (;;)
00306 {
00307 fpMatch = memMatch(fpPerfect, minPerfect, fDna, endDna - fDna);
00308 if (fpMatch == NULL)
00309 break;
00310 sizeLeft = endDna - fpMatch;
00311 rDna = fpMatch;
00312 for (;;)
00313 {
00314 int fPos, rPos, fGoodPos, rGoodPos;
00315 rpMatch = memMatch(rpPerfect, minPerfect, rDna, endDna - rDna);
00316 if (rpMatch == NULL)
00317 break;
00318 fPos = fpMatch - (fPrimerSize - minPerfect) - seq->dna;
00319 rPos = rpMatch + rPrimerSize - seq->dna;
00320 fGoodPos = fpMatch - goodSize - seq->dna;
00321 rGoodPos = rpMatch + minPerfect - seq->dna;
00322 matchSize = rPos - fPos;
00323 if (rPos >= 0 && fPos >= 0 && fPos < seq->size && matchSize <= maxSize)
00324 {
00325
00326 if (goodMatch(seq->dna + fGoodPos, fpPerfect - goodSize, goodSize) &&
00327 goodMatch(seq->dna + rGoodPos, rpPerfect + minPerfect, goodSize))
00328 {
00329 AllocVar(out);
00330 out->name = cloneString(pcrName);
00331 out->fPrimer = cloneString(fPrimer);
00332 out->rPrimer = cloneString(rPrimer);
00333 reverseComplement(out->rPrimer, rPrimerSize);
00334 out->seqName = cloneString(seqName);
00335 out->seqSize = seqSize;
00336 out->fPos = fPos + seqOffset;
00337 out->rPos = rPos + seqOffset;
00338 out->strand = strand;
00339 out->dna = cloneStringZ(seq->dna + fPos, matchSize);
00340
00341
00342
00343
00344 if (strand == '-')
00345 {
00346 char *temp = out->rPrimer;
00347 out->rPrimer = out->fPrimer;
00348 out->fPrimer = temp;
00349 reverseComplement(out->dna, matchSize);
00350 }
00351 slAddHead(pOutList, out);
00352 }
00353 }
00354 rDna = rpMatch+1;
00355 }
00356 fDna = fpMatch + 1;
00357 }
00358 reverseComplement(rPrimer, rPrimerSize);
00359 }
00360
00361 void gfPcrLocal(char *pcrName,
00362 struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize,
00363 int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
00364 int minPerfect, int minGood, char strand, struct gfPcrOutput **pOutList)
00365
00366
00367 {
00368
00369
00370 if (strand == '-')
00371 pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize,
00372 rPrimer, rPrimerSize, fPrimer, fPrimerSize,
00373 minPerfect, minGood, strand, pOutList);
00374 else
00375 pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize,
00376 fPrimer, fPrimerSize, rPrimer, rPrimerSize,
00377 minPerfect, minGood, strand, pOutList);
00378 }
00379
00380 struct gfRange *gfPcrGetRanges(char *host, char *port, char *fPrimer, char *rPrimer,
00381 int maxSize)
00382
00383 {
00384 char buf[256];
00385 int conn = gfConnect(host, port);
00386 struct gfRange *rangeList = NULL, *range;
00387
00388
00389 safef(buf, sizeof(buf), "%spcr %s %s %d", gfSignature(), fPrimer, rPrimer, maxSize);
00390 write(conn, buf, strlen(buf));
00391 for (;;)
00392 {
00393 if (netGetString(conn, buf) == NULL)
00394 break;
00395 if (sameString(buf, "end"))
00396 break;
00397 else if (startsWith("Error:", buf))
00398 errAbort(buf);
00399 else
00400 {
00401 char *s = buf;
00402 char *name, *start, *end, *strand;
00403 name = nextWord(&s);
00404 start = nextWord(&s);
00405 end = nextWord(&s);
00406 strand = nextWord(&s);
00407 if (strand == NULL)
00408 errAbort("Truncated gfServer response");
00409 AllocVar(range);
00410 range->tName = cloneString(name);
00411 range->tStart = atoi(start);
00412 range->tEnd = atoi(end);
00413 range->tStrand = strand[0];
00414 slAddHead(&rangeList, range);
00415 }
00416 }
00417 close(conn);
00418 slReverse(&rangeList);
00419 return rangeList;
00420 }
00421
00422 static void gfPcrOneViaNet(
00423 char *host, char *port, char *seqDir,
00424 char *pcrName, char *fPrimer, char *rPrimer, int maxSize,
00425 int minPerfect, int minGood,
00426 struct hash *tFileCache, struct gfPcrOutput **pOutList)
00427
00428
00429 {
00430 struct gfRange *rangeList = NULL, *range;
00431 int fPrimerSize = strlen(fPrimer);
00432 int rPrimerSize = strlen(rPrimer);
00433 int maxPrimerSize = max(fPrimerSize, rPrimerSize);
00434 int minPrimerSize = min(fPrimerSize, rPrimerSize);
00435
00436 tolowers(fPrimer);
00437 tolowers(rPrimer);
00438
00439
00440 if (minPrimerSize < minGood || minPrimerSize < minPerfect)
00441 errAbort("Need longer primer in pair %s (%dbp) %s (%dbp). Minimum size is %d\n",
00442 fPrimer, fPrimerSize, rPrimer, rPrimerSize, minGood);
00443
00444
00445 rangeList = gfPcrGetRanges(host, port, fPrimer, rPrimer, maxSize);
00446 for (range = rangeList; range != NULL; range = range->next)
00447 {
00448 int tSeqSize;
00449 char seqName[PATH_LEN];
00450 struct dnaSeq *seq = gfiExpandAndLoadCached(range,
00451 tFileCache, seqDir, 0, &tSeqSize, FALSE, FALSE, maxPrimerSize);
00452 gfiGetSeqName(range->tName, seqName, NULL);
00453 gfPcrLocal(pcrName, seq, range->tStart, seqName, tSeqSize, maxSize,
00454 fPrimer, fPrimerSize, rPrimer, rPrimerSize,
00455 minPerfect, minGood, range->tStrand, pOutList);
00456 dnaSeqFree(&seq);
00457 }
00458 gfRangeFreeList(&rangeList);
00459 }
00460
00461
00462 struct gfPcrOutput *gfPcrViaNet(char *host, char *port, char *seqDir,
00463 struct gfPcrInput *inList, int maxSize, int minPerfect, int minGood)
00464
00465 {
00466 struct hash *tFileCache = gfFileCacheNew();
00467 struct gfPcrInput *in;
00468 struct gfPcrOutput *outList = NULL;
00469
00470 for (in = inList; in != NULL; in = in->next)
00471 {
00472 gfPcrOneViaNet(host, port, seqDir,
00473 in->name, in->fPrimer, in->rPrimer, maxSize,
00474 minPerfect, minGood,
00475 tFileCache, &outList);
00476 }
00477 gfFileCacheFree(&tFileCache);
00478 slReverse(&outList);
00479 return outList;
00480 }
00481
00482 char *gfPcrMakePrimer(char *s)
00483
00484
00485 {
00486 int size = dnaFilteredSize(s);
00487 int realSize;
00488 char *primer = needMem(size+1);
00489 dnaFilter(s, primer);
00490 realSize = size - countChars(primer, 'n');
00491 if (realSize < 10 || realSize < size/2)
00492 errAbort("%s does not seem to be a good primer", s);
00493 return primer;
00494 }
00495