Mercurial > repos > pfrommolt > ngsrich
comparison NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip
Uploaded
| author | pfrommolt |
|---|---|
| date | Mon, 21 Nov 2011 08:12:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:89ad0a9cca52 |
|---|---|
| 1 package _main; | |
| 2 | |
| 3 import java.io.File; | |
| 4 import java.io.FileWriter; | |
| 5 import java.io.IOException; | |
| 6 import java.sql.Time; | |
| 7 import java.util.Scanner; | |
| 8 | |
| 9 import middlewares.Misc; | |
| 10 import converters.Read2Wig; | |
| 11 import converters.ReadOnTarget2Wig; | |
| 12 import exceptions.ChromosomeFormatException; | |
| 13 import exceptions.ChromosomeMismatchException; | |
| 14 import exceptions.ChromosomeNotFoundException; | |
| 15 import exceptions.FileFormatException; | |
| 16 import exceptions.GenomeAnnotationException; | |
| 17 import exceptions.NullOrNegativeRangeException; | |
| 18 import exceptions.RangeFormatException; | |
| 19 import exceptions.RangeLimitNotFoundException; | |
| 20 import filters.Filter; | |
| 21 import filters.GenomeFilter; | |
| 22 import filters.ReadFilter; | |
| 23 import filters.TargetFilter; | |
| 24 | |
| 25 /** | |
| 26 * This is a scheduler for the phases of the evaluation. | |
| 27 * | |
| 28 * @author Ali Abdallah | |
| 29 * @version 19.03.2011 | |
| 30 * @since jdk 1.6 | |
| 31 */ | |
| 32 | |
| 33 public class Enrichment { | |
| 34 | |
| 35 String | |
| 36 /** | |
| 37 * The name of the read alignment file. | |
| 38 */ | |
| 39 readFileName, | |
| 40 /** | |
| 41 * the name of the genome annotation file. | |
| 42 */ | |
| 43 genomeFName, | |
| 44 /** | |
| 45 * the ucsc-name of the genome (e.g. hg19). | |
| 46 */ | |
| 47 genomeName, | |
| 48 /** | |
| 49 * the name of the file of the targeted regions. | |
| 50 */ | |
| 51 targetFName, | |
| 52 /** | |
| 53 * the temporary directory. | |
| 54 */ | |
| 55 tmpDir, | |
| 56 /** | |
| 57 * the output directory. | |
| 58 */ | |
| 59 outDir, | |
| 60 /** | |
| 61 * the xml summary file containing various overall statistics. | |
| 62 */ | |
| 63 xmlSummaryFile, | |
| 64 /** | |
| 65 * the name of the detailed coverage data file. | |
| 66 */ | |
| 67 detailsFileName, | |
| 68 /** | |
| 69 * the .bed file containing target-level coverage statistics data. | |
| 70 */ | |
| 71 beddetailsFileName, | |
| 72 /** | |
| 73 * the process number used for unique naming. | |
| 74 */ | |
| 75 proc, | |
| 76 /** | |
| 77 * the name of the read alignment file without extension. | |
| 78 */ | |
| 79 prefix, | |
| 80 /** | |
| 81 * the extension of the read alignment file. | |
| 82 */ | |
| 83 suffix, | |
| 84 /** | |
| 85 * The name of the read alignment file after the reduction. | |
| 86 */ | |
| 87 areadFileName, | |
| 88 /** | |
| 89 * The name of the genome annotation file after the reduction. | |
| 90 */ | |
| 91 aGenomeFName, | |
| 92 /** | |
| 93 * the name of the file of the targeted regions after the reduction. | |
| 94 */ | |
| 95 aTargetFName; | |
| 96 int | |
| 97 /** | |
| 98 * Cutoff for poor coverage [default: 2]. | |
| 99 * | |
| 100 */ | |
| 101 poor, | |
| 102 /** | |
| 103 * Cutoff for high coverage [default: 200]. | |
| 104 */ | |
| 105 high, | |
| 106 details; | |
| 107 | |
| 108 /** | |
| 109 * Construcs an enrichment object and initializes the evaluation parameters. | |
| 110 * @param args the list of all evaluation parameters. | |
| 111 */ | |
| 112 public Enrichment(String... args) { | |
| 113 readFileName = args[0]; | |
| 114 genomeName = args[8]; | |
| 115 targetFName = args[2]; | |
| 116 if (!targetFName.endsWith("bed")) { | |
| 117 System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format."); | |
| 118 } | |
| 119 if (args.length > 3) { | |
| 120 Time time = new Time(System.currentTimeMillis()); | |
| 121 String nano = ""+System.nanoTime(); | |
| 122 | |
| 123 tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+ | |
| 124 time.getHours()+"_"+ | |
| 125 time.getMinutes()+"_"+ | |
| 126 time.getSeconds()+"_"+nano+ "/"; | |
| 127 new File(tmpDir).mkdir(); | |
| 128 new File(tmpDir).setReadable(true); | |
| 129 new File(tmpDir).setWritable(true); | |
| 130 outDir = args[4] + Misc.slash(args[4]); | |
| 131 | |
| 132 if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) { | |
| 133 System.out.println("File " + tmpDir | |
| 134 + " is not a directory.\nProgram stopped."); | |
| 135 System.exit(0); | |
| 136 } else if (!(new File(tmpDir).exists())) { | |
| 137 System.out.println("File " + tmpDir | |
| 138 + " not found.\nProgram stopped."); | |
| 139 System.exit(0); | |
| 140 } | |
| 141 } | |
| 142 if (args.length > 5) { | |
| 143 try { | |
| 144 poor = Integer.parseInt(args[5]); | |
| 145 high = Integer.parseInt(args[6]); | |
| 146 } catch (NumberFormatException nfe) { | |
| 147 System.out | |
| 148 .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values."); | |
| 149 poor = 2; | |
| 150 high = 200; | |
| 151 } | |
| 152 } else { | |
| 153 poor = 2; | |
| 154 high = 200; | |
| 155 } | |
| 156 | |
| 157 genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1]; | |
| 158 | |
| 159 prefix = Misc.prefix(readFileName); | |
| 160 if (args[7] != "none") { | |
| 161 prefix = args[7]; | |
| 162 } | |
| 163 details = Integer.parseInt(args[9]); | |
| 164 | |
| 165 suffix = Misc.suffix(readFileName); | |
| 166 proc = Misc.getHostName(); | |
| 167 areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt"; | |
| 168 aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt"; | |
| 169 aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed"; | |
| 170 detailsFileName = tmpDir + "coverage_" + proc + ".txt"; | |
| 171 xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml"; | |
| 172 beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed"; | |
| 173 } | |
| 174 | |
| 175 public static final String[] GENOME_NAMES = { | |
| 176 "anoGam1", "bosTau4", "cb3", "ce4", "ce6", | |
| 177 "danRer5", "danRer6", "danRer7", "dm2", "dm3", | |
| 178 "galGal2", "galGal3", "hg19", "hg18", "mm8", | |
| 179 "mm9", "panTro3", "rn3", "rn4", "susScr2" | |
| 180 }; | |
| 181 | |
| 182 public int genomeID(String gname){ | |
| 183 for(int i = 0; i < GENOME_NAMES.length; i++) | |
| 184 if(GENOME_NAMES[i].equals(gname)) | |
| 185 return i; | |
| 186 return -1; | |
| 187 } | |
| 188 | |
| 189 /** | |
| 190 * Downloads the specified genome annotation, if internet connection | |
| 191 * exists. | |
| 192 * | |
| 193 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19) | |
| 194 * @return the very same name of the genome. | |
| 195 * @throws GenomeAnnotationException | |
| 196 */ | |
| 197 | |
| 198 public String getGenomeAnnotation() throws GenomeAnnotationException{ | |
| 199 try{ | |
| 200 return downloadGenomeAnnotation(); | |
| 201 }catch(Exception e){ | |
| 202 if(genomeID(genomeName)!=-1) | |
| 203 return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt"; | |
| 204 else if(new File(genomeName).exists()) | |
| 205 return genomeName; | |
| 206 else | |
| 207 throw new GenomeAnnotationException(); | |
| 208 } | |
| 209 } | |
| 210 | |
| 211 public String getChromInfo(){ | |
| 212 //genomeName | |
| 213 return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt"; | |
| 214 } | |
| 215 | |
| 216 public String downloadGenomeAnnotation() { | |
| 217 if (genomeName.indexOf("/") == -1) { | |
| 218 String tmpGenomeDir = tmpDir + genomeName; | |
| 219 try { | |
| 220 if (!new File(tmpGenomeDir).exists()) { | |
| 221 new File(tmpGenomeDir).mkdir(); | |
| 222 } | |
| 223 File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh"); | |
| 224 if (!getGenomeScript.exists()) { | |
| 225 getGenomeScript.createNewFile(); | |
| 226 } | |
| 227 FileWriter fw = new FileWriter(getGenomeScript); | |
| 228 | |
| 229 getGenomeScript.setReadable(true); | |
| 230 getGenomeScript.setWritable(true); | |
| 231 getGenomeScript.setExecutable(true); | |
| 232 getGenomeScript.setExecutable(true); | |
| 233 String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir) | |
| 234 + "refGene.txt.gz"; | |
| 235 fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n" | |
| 236 + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName | |
| 237 + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile | |
| 238 + "\n" + "gunzip " + tmpGenomeFile + "\n"); | |
| 239 fw.close(); | |
| 240 Runtime.getRuntime().exec("sh " + getGenomeScript); | |
| 241 try { | |
| 242 Thread.sleep(10000); | |
| 243 } catch (InterruptedException e) { | |
| 244 e.printStackTrace(); | |
| 245 } | |
| 246 | |
| 247 } catch (IOException e) { | |
| 248 e.printStackTrace(); | |
| 249 } | |
| 250 return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt"; | |
| 251 } else { | |
| 252 return genomeName; | |
| 253 } | |
| 254 } | |
| 255 | |
| 256 /** | |
| 257 * A bridge method which converts a bam file to a sam file for further | |
| 258 * processing. This method assures therefore the compatibility of the | |
| 259 * software with the bam format. | |
| 260 * | |
| 261 * @return the name of the generated sam file. | |
| 262 * @throws IOException | |
| 263 */ | |
| 264 public String bam2sam() throws IOException { | |
| 265 | |
| 266 String[] bampath = readFileName.split("/"); | |
| 267 String bamfile = bampath[bampath.length - 1]; | |
| 268 String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam"; | |
| 269 | |
| 270 File getBam2SamScript = new File(tmpDir + "bam2sam.sh"); | |
| 271 if (!getBam2SamScript.exists()) { | |
| 272 getBam2SamScript.createNewFile(); | |
| 273 } | |
| 274 FileWriter fw = new FileWriter(getBam2SamScript); | |
| 275 fw.write("#!/bin/bash\n" + Misc.binDir() | |
| 276 + "/../thirdparty/samtools/0.1.16/samtools view " | |
| 277 + readFileName + " > " + samfile); | |
| 278 fw.close(); | |
| 279 | |
| 280 Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript); | |
| 281 // try { | |
| 282 // @SuppressWarnings("unused") | |
| 283 // "exitstate" variable unused | |
| 284 //int exitstate = p.waitFor(); | |
| 285 // } catch (InterruptedException e) { | |
| 286 //e.printStackTrace(); | |
| 287 // } | |
| 288 | |
| 289 Scanner stderr=new Scanner(p.getErrorStream()); | |
| 290 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
| 291 stderr.close(); | |
| 292 | |
| 293 return (samfile); | |
| 294 | |
| 295 } | |
| 296 | |
| 297 /** | |
| 298 * First phase of the evaluation. In this phase input files are being | |
| 299 * simplified for further processing. | |
| 300 * | |
| 301 * @throws FileFormatException | |
| 302 * @throws ChromosomeMismatchException | |
| 303 */ | |
| 304 public void reduceFiles() throws FileFormatException, ChromosomeMismatchException { | |
| 305 String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt"; | |
| 306 String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt"; | |
| 307 String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt"; | |
| 308 | |
| 309 | |
| 310 Filter[] filters = {new ReadFilter(readFileName, aReadFileName), | |
| 311 new GenomeFilter(genomeFName, aGenomeFName), | |
| 312 new TargetFilter(targetFName, aTargetFName)}; | |
| 313 | |
| 314 for(Filter f: filters) f.filter(); | |
| 315 | |
| 316 if(!Misc.areChromosomeCompatible(new File(aReadFileName), | |
| 317 new File(aTargetFName))) | |
| 318 throw new ChromosomeMismatchException(); | |
| 319 | |
| 320 System.out.println("\nSTEP 1 successfully completed"); | |
| 321 } | |
| 322 | |
| 323 /** | |
| 324 * Second phase of the evaluation. An EnrichmentStatsComputer object is | |
| 325 * called with the right parameters and computation is checked for formal | |
| 326 * correctness. | |
| 327 */ | |
| 328 public void computeTargetCoverageFiles() { | |
| 329 | |
| 330 File data = new File(outDir + "/data"); | |
| 331 data.mkdir(); | |
| 332 File plots = new File(outDir + "/plots"); | |
| 333 plots.mkdir(); | |
| 334 | |
| 335 EnrichmentStatsComputer esc = | |
| 336 new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir); | |
| 337 | |
| 338 try { | |
| 339 esc.compute(); | |
| 340 } catch (RangeFormatException e) { | |
| 341 e.printStackTrace(); | |
| 342 } catch (ChromosomeFormatException e) { | |
| 343 e.printStackTrace(); | |
| 344 } catch (ChromosomeNotFoundException e) { | |
| 345 e.printStackTrace(); | |
| 346 } catch (RangeLimitNotFoundException e) { | |
| 347 e.printStackTrace(); | |
| 348 } catch (NullOrNegativeRangeException e) { | |
| 349 e.printStackTrace(); | |
| 350 } catch (IOException e) { | |
| 351 e.printStackTrace(); | |
| 352 } | |
| 353 | |
| 354 if (new File(detailsFileName).exists()) { | |
| 355 System.out.println("\nSTEP 2 successfully completed"); | |
| 356 } else { | |
| 357 System.out.println("STEP 2 incomplete."); | |
| 358 } | |
| 359 | |
| 360 } | |
| 361 | |
| 362 /** | |
| 363 * Third phase of the evaluation. Based on the results from phase 2, the | |
| 364 * evaluation is further developed and evaluation data are visualized by | |
| 365 * calling "eval_enrichment.R" Script. | |
| 366 */ | |
| 367 public void evaluate() { | |
| 368 Runtime rt = Runtime.getRuntime(); | |
| 369 try { | |
| 370 String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R"; | |
| 371 int lastSlash = outDir.lastIndexOf("/"); | |
| 372 String outDirR = (lastSlash == outDir.length() - 1) ? outDir | |
| 373 .substring(0, lastSlash) : outDir; | |
| 374 | |
| 375 String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " " | |
| 376 + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " | |
| 377 + targetFName+" "+details; | |
| 378 | |
| 379 Process p=rt.exec(call); | |
| 380 | |
| 381 Scanner stdout = new Scanner(p.getInputStream()); | |
| 382 Scanner stderr=new Scanner(p.getErrorStream()); | |
| 383 while (stdout.hasNextLine()){System.out.println(stdout.nextLine());} | |
| 384 while (stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
| 385 stdout.close(); stderr.close(); | |
| 386 | |
| 387 if(details == 1){ | |
| 388 String chrInfoPath = getChromInfo(); | |
| 389 | |
| 390 //System.out.println("chrInfoPath\t"+chrInfoPath); | |
| 391 script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R"; | |
| 392 outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir; | |
| 393 | |
| 394 String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName | |
| 395 + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " | |
| 396 + genomeName; | |
| 397 Process p2 = rt.exec(call2); | |
| 398 stdout=new Scanner(p2.getInputStream()); | |
| 399 stderr=new Scanner(p2.getErrorStream()); | |
| 400 | |
| 401 while(stdout.hasNextLine()){System.out.println(stdout.nextLine());} | |
| 402 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
| 403 stdout.close(); stderr.close(); | |
| 404 } | |
| 405 | |
| 406 String path = outDir+prefix+"_enrichment.html"; | |
| 407 System.out.println("Created plots and HTML summary report"); | |
| 408 if (new File(path).exists()) { | |
| 409 System.out.println("\nSTEP 3 successfully completed"); | |
| 410 } else { | |
| 411 System.out.println("HTML FILE " + path + " not found"); | |
| 412 System.out.println("\nSTEP 3 unsuccessful"); | |
| 413 } | |
| 414 } | |
| 415 catch (IOException e) { | |
| 416 e.printStackTrace(); | |
| 417 } | |
| 418 } | |
| 419 | |
| 420 /** | |
| 421 * This is the fourth phase. In this phase the detailed datta from phase 2 | |
| 422 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig | |
| 423 * object to accomplish this task. | |
| 424 * | |
| 425 */ | |
| 426 public void computeWiggleFile() { | |
| 427 try { | |
| 428 System.out.println("Computing wiggle file for on-target reads"); | |
| 429 if (!new File(detailsFileName).exists()) | |
| 430 detailsFileName = tmpDir + new File(detailsFileName).getName(); | |
| 431 | |
| 432 new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", | |
| 433 Misc.prefix(readFileName)+ "\nonTarget"); | |
| 434 System.out.println("Wiggle file for on-target reads created"); | |
| 435 System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig"); | |
| 436 File[] outputFiles = new File(outDir + "data/") | |
| 437 .listFiles(); | |
| 438 for (int i = 0; i < outputFiles.length; i++) { | |
| 439 if (outputFiles[i].getName().endsWith("wig") | |
| 440 && outputFiles[i].getName().startsWith( | |
| 441 detailsFileName.substring( | |
| 442 detailsFileName.lastIndexOf("/") + 1, | |
| 443 detailsFileName.lastIndexOf(".")))) { | |
| 444 File f = new File(outDir + "data" | |
| 445 + "/" + Misc.prefix(readFileName) + "_onTarget.wig"); | |
| 446 outputFiles[i].renameTo(f); | |
| 447 break; | |
| 448 } | |
| 449 } | |
| 450 if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){ | |
| 451 System.out.println("\nSTEP 4 successfully completed."); | |
| 452 } else { | |
| 453 System.out.println(outDir+"data/"+ prefix | |
| 454 + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful"); | |
| 455 } | |
| 456 } catch (IOException e) { | |
| 457 System.err.println("Target conversion in Wig unsuccessful"); | |
| 458 e.printStackTrace(); | |
| 459 } | |
| 460 } | |
| 461 | |
| 462 /** | |
| 463 * This is the fifth and last phase of the evaluation process. In contrast | |
| 464 * to the fourth phase, this method uses a Read2Wig object to convert the | |
| 465 * detailed data for all reads (and not only for reads on targets) to the | |
| 466 * wiggle format. | |
| 467 * | |
| 468 */ | |
| 469 void computeOverallWiggleFile() { | |
| 470 try { | |
| 471 System.out.println("Computing wiggle file for all reads"); | |
| 472 new Read2Wig(areadFileName, prefix, outDir + "data", | |
| 473 genomeFName, tmpDir); | |
| 474 String overallWiggleFileName = prefix + ".wig"; | |
| 475 System.out.println("Wiggle file for all reads created"); | |
| 476 System.out.println("Output written to " + outDir + "data/" + | |
| 477 overallWiggleFileName); | |
| 478 if (new File(outDir + "data/" + overallWiggleFileName).exists()) { | |
| 479 System.out.println("\nSTEP 5 successfully completed."); | |
| 480 } else { | |
| 481 System.out.println(outDir + "data/" + overallWiggleFileName | |
| 482 + "not found\n\nSTEP 5 unsuccessful"); | |
| 483 } | |
| 484 } catch (IOException e) { | |
| 485 System.err | |
| 486 .println("Creating wiggle file for all reads was unsuccessful"); | |
| 487 e.printStackTrace(); | |
| 488 } | |
| 489 } | |
| 490 } |
