Mercurial > repos > pfrommolt > ngsrich

package filters;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;
import middlewares.Misc;
import datastructures.Frame;
import datastructures.TargetLine;
import exceptions.ChromosomeFormatException;
import exceptions.ChromosomeNotFoundException;
import exceptions.NullOrNegativeRangeException;
import exceptions.RangeFormatException;
import exceptions.RangeLimitNotFoundException;

/**
 * Filter that prepares a target region file for further processing.
 *
 * <p>{@link #filter()} runs three steps: it strips browser/track header lines
 * and keeps only the first track, sorts the remaining lines with the external
 * {@code sort} utility, and finally merges overlapping regions on the same
 * chromosome into single regions written to the output path.
 */
public class TargetFilter extends Filter {

	// variables used for test purposes.
	// target_size accumulates the size of every region written by filter(),
	// target_regions counts those regions.
	public int target_size, target_regions;

	/**
	 * Creates a filter reading from {@code input} and writing to {@code output}.
	 *
	 * @param input path of the raw target region file.
	 * @param output path of the file receiving the overlap-free regions.
	 */
	public TargetFilter(String input, String output) {
		super(input, output);
		target_size = 0;
		target_regions = 0;
	}

	/**
	 * Runs the complete filtering pipeline (header removal, sorting, merging of
	 * overlapping regions) and writes the result to the output path.
	 *
	 * <p>Errors are reported on standard error via {@code printStackTrace()};
	 * no exception is propagated to the caller. All readers and writers opened
	 * here are closed before the method returns, also on failure.
	 */
	public void filter() {
		Scanner s = null;
		FileWriter fw = null;
		try {

			s = new Scanner(new File(getInputPath()));
			System.out.println("TARGET REGIONS FILE:");
			/*
			 * 1. STEP:
			 * a. Remove all but the first track.
			 * b. Remove browser and track header lines of the first track
			 * c. Save the output in the output directory and return the path of the output file.
			 */
			String finput;
			try {
				finput = filterTracks(s);
			} finally {
				// the raw input is no longer needed once the first track is extracted.
				s.close();
				s = null;
			}
			setInputPath(finput);

			/*
			 * 2. STEP:
			 * a. Sort the target region file lexicographically by chromosome-name.
			 * b. If chr-names are identical then numerically by the start position.
			 * c. If start positions are identical then numerically by the end position.
			 */
			sort();

			/*
			 * 3. STEP:
			 * Unify all overlapping target regions.
			 */
			s = new Scanner(new File(getInputPath()));
			fw = new FileWriter(getOutputPath());
			// for all target regions do the following:
			if (s.hasNextLine()) {
				try {
					// parse the current target region.
					TargetLine tl = new TargetLine(s.nextLine());
					// create a new frame representing the current target region
					// (second argument looks like the region length - confirm against Frame).
					Frame union = new Frame(tl.start(), tl.end() - tl.start() + 1);
					String chrom = tl.chrom();
					while (s.hasNextLine()) {
						// parse the next target region.
						tl = new TargetLine(s.nextLine());
						// create the corresponding frame.
						Frame nextTarget = new Frame(tl.start(), tl.end() - tl.start() + 1);
						String nextChrom = tl.chrom();
						// if current and next overlap each other.
						if (union.overlaps(nextTarget) && chrom.equals(nextChrom)) {
							// unify regions.
							union = union.unify(nextTarget);
							// go to the next iteration.
							continue;
						}
						// else
						// write the computed overlap-free target region.
						writeRegion(fw, chrom, union);
						// refresh the union and chrom variables for the next computation.
						union = nextTarget;
						chrom = nextChrom;
					}

					// the last pending region has no successor that would flush it.
					writeRegion(fw, chrom, union);
				} catch (RangeFormatException e) {
					e.printStackTrace();
				} catch (ChromosomeFormatException e) {
					e.printStackTrace();
				} catch (ChromosomeNotFoundException e) {
					e.printStackTrace();
				} catch (RangeLimitNotFoundException e) {
					e.printStackTrace();
				} catch (NullOrNegativeRangeException e) {
					e.printStackTrace();
				}
			}
		} catch (FileNotFoundException fnfe) {
			fnfe.printStackTrace();
		} catch (IOException ioe) {
			ioe.printStackTrace();
		} finally {
			// release the reader and writer on every path, including an empty
			// input file and parse failures.
			if (s != null) {
				s.close();
			}
			if (fw != null) {
				try {
					fw.close();
				} catch (IOException ioe) {
					ioe.printStackTrace();
				}
			}
		}
		System.out.println(getInputPath()+" reduced to "+getOutputPath());
	}

	/**
	 * Writes one overlap-free region as a tab separated line and updates the
	 * {@code target_size} and {@code target_regions} counters.
	 *
	 * @param fw the writer of the output file.
	 * @param chrom the chromosome name of the region.
	 * @param union the merged region.
	 * @throws IOException if writing fails.
	 */
	private void writeRegion(FileWriter fw, String chrom, Frame union) throws IOException {
		target_size += union.end()-union.start()+1;
		target_regions++;
		fw.write(chrom +"\t"+ union.start() +"\t"+ union.end() +"\n");
	}

	/**
	 * 	1. Removes all but the first track.
	 * 	2. Removes browser and track header lines of the first track
	 * 	3. Saves the output in the output directory and return the path of the output file.
	 *
	 * @param s the scanner reading the raw file. It is not closed by this method.
	 * @return the path of the output file.
	 * @throws IOException if writing fails.
	 */
	private String filterTracks(Scanner s) throws IOException {

		String finput = Misc.path(getOutputPath())+ Misc.prefix(getInputPath()) + ".bed";
		FileWriter fw = new FileWriter(finput);
		try {
			// Counters for browser header lines and track header lines.
			int browserRead = 0;
			int trackRead = 0;
			while (s.hasNextLine()) {
				String line = s.nextLine();
				// Count browser lines.
				if (line.startsWith("browser")) {
					browserRead++;
				// Count header lines.
				} else if (line.startsWith("track")) {
					trackRead++;
				} else {
					// Write lines as long as they correspond to the first track.
					// NOTE(review): a header with more than one "browser" line makes
					// this condition false for the very first data line, so nothing
					// is written - confirm whether that is intended.
					if (trackRead <= 1 && browserRead <= 1)
						fw.write(line + "\n");
					else
					// Otherwise cancel the computation.
						break;
				}
			}
		} finally {
			// close the writer even if a write fails.
			fw.close();
		}
		return finput;
	}

	/**
	 *  1. Sort the target region file lexicographically by chromosome-name.
	 * 	2. If chr-names are identical then numerically by the start position.
	 * 	3. If start positions are identical then numerically by the end position.
	 *
	 * <p>The sorting is delegated to the external {@code sort} program, whose
	 * output is collected in a temporary file next to the output file. The
	 * temporary file replaces the input file only if {@code sort} exits with
	 * status 0; otherwise the input file is left untouched and the problem is
	 * reported on standard error. The input path is the same before and after
	 * the call.
	 */
	public void sort() {
		String unsorted = getInputPath();
		// resolve against the working directory first so that an output path
		// without a directory component still has a parent.
		File outputDir = new File(getOutputPath()).getAbsoluteFile().getParentFile();
		if (outputDir == null) {
			System.err.println("Cannot determine the directory of "+getOutputPath()+"; "+unsorted+" not sorted");
			return;
		}
		String tmpD = outputDir.getAbsolutePath();
		String sorted = tmpD+Misc.slash(tmpD)+Misc.prefix(unsorted)+"Sorted";
		File sortedFile = new File(sorted);

		try {
			// one argument per token: paths containing blanks are passed intact.
			ProcessBuilder pb = new ProcessBuilder(
					"sort", "-k1,1", "-k2n,2", "-k3n,3", "-T", tmpD, unsorted);
			Process p = pb.start();
			int exitCode;
			try {
				Scanner ps = new Scanner(p.getInputStream());
				try {
					// creates the temporary file if it does not exist yet.
					FileWriter fw = new FileWriter(sorted);
					try {
						while(ps.hasNextLine()){
							String nextLine = ps.nextLine();
							fw.write(nextLine+"\n");
						}
					} finally {
						fw.close();
					}
				} finally {
					ps.close();
				}
				exitCode = p.waitFor();
			} finally {
				// no effect after a normal exit; stops the child if copying failed.
				p.destroy();
			}

			if (exitCode != 0) {
				// do not replace the input with possibly truncated output.
				System.err.println("sort exited with status "+exitCode+"; "+unsorted+" left unsorted");
				sortedFile.delete();
				return;
			}
			if (!sortedFile.renameTo(new File(unsorted))) {
				System.err.println("Could not replace "+unsorted+" with sorted file "+sorted);
				return;
			}
			System.out.println("Target file "+new File(unsorted).getAbsolutePath()+" sorted");

		} catch (IOException e1) {
			e1.printStackTrace();
		} catch (InterruptedException ie) {
			// keep the interruption visible to the caller's thread.
			Thread.currentThread().interrupt();
			ie.printStackTrace();
		}
	}

	/**
	 * Runs the filter from the command line.
	 *
	 * @param args optionally the input path followed by the output path; with
	 *             any other argument count the built-in example paths are used.
	 */
	public static void main(String[] args){
		String input = "/home/abdallah/Desktop/input/Agilent_SureSelect_50Mb.bed";
		String output = "/home/abdallah/Desktop/output/Agilent_SureSelect_50Mb_Output.bed";
		if (args != null && args.length == 2) {
			input = args[0];
			output = args[1];
		}
		new TargetFilter(input, output).filter();
	}

}
author	pfrommolt
date	Mon, 21 Nov 2011 08:12:19 -0500
parents
children