Mercurial > repos > jjohnson > fastq_sync
changeset 0:751f4938cf0d
Uploaded
author | jjohnson |
---|---|
date | Tue, 05 Feb 2013 15:23:18 -0500 |
parents | |
children | b0ab279b5add |
files | README pe_sync.xml pe_sync_2_files.pl resync.pl resync.xml test-data/reads1.fastqsanger test-data/reads2.fastqsanger test-data/reads_unsync_1.fastqsanger test-data/reads_unsync_2.fastqsanger |
diffstat | 9 files changed, 387 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,7 @@ +These tools check and fix synchronization of paired read fastq files. +Those files may get out of synchronization due to quality filtering. + +These script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. +Other types of read ID formats may cause a terminal error. + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pe_sync.xml Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,32 @@ +<tool id="pe_sync" name="pe-sync: Paired-end synchronization check" version="1.0"> + <description>The Paired-end synchronization check program determines if the reads in paired-end fastq files are in the proper order (synchronized).</description> + <command interpreter="perl"> + pe_sync_2_files.pl $quickcheck $input1 $input2 2>&1 | tee $output + </command> + <inputs> + <param name="input1" type="data" format="fastq" label="Input 1"/> + <param name="input2" type="data" format="fastq" label="Input 2"/> + <param name="quickcheck" type="boolean" truevalue="quick" falsevalue="" checked="false" label="Just check a sample of the sequences" + help=""/> + </inputs> + <stdio> + <exit_code range="1:" level="fatal" description="Bad input dataset" /> + </stdio> + <outputs> + <data format="txt" name="output" label="pe-sync report for ${input1.name} and ${input2.name}"/> + </outputs> + <tests> + <test> + </test> + </tests> + <help> +Paired-end read file synchronization checks if left and right reads are in the same order. + +The "quick" option will not report the percent of out-of-sync reads +for many failed files, but will run much faster. + +This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. +Other types of read ID formats may cause a terminal error. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pe_sync_2_files.pl Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,148 @@ +#!/usr/bin/perl -w + +############################################### +# pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl +# John Garbe +# February 2012 +# +# Paired-end read file synchronization script: checks if left and right +# reads are in the same order. +# This script checks the two files specificd on the command line. +# The "quick" option will not report the percent of out-of-sync reads +# for many failed files, but will run much faster. +# +# This script should run fine on linux systems or solaris +# (which is what the project space fileserver runs) +# +# This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. +# Other types of read ID formats will cause a terminal error. +# +############################################### + +die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0); + +# determine if we're running on solaris in order to change system calls +$output = `perl -v`; +$solaris = 0; +$solaris = 1 if ($output =~ /solaris/); +if ($solaris) { +# print STDERR "Running on Solaris\n"; +} + +# handle "quick" option +$quick = 0; # 1 for quickcheck, 0 for thorough check +if (defined($ARGV[0])) { + if ($ARGV[0] eq "quick") { + $quick = 1; + shift @ARGV; + } +} +print STDERR "QUICK CHECK enabled\n" if ($quick); + +$filename_r1 = $ARGV[0]; +$filename_r2 = $ARGV[1]; + +# identify read id format type +if ($solaris) { + $readid = `head -1 $filename_r1`; +} else { + $readid = `head -n 1 $filename_r1`; +} +chomp $readid; +if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line + $id_type = 1; + print STDERR "Casava 1.7 read id style\n"; # TESTING +} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace + $id_type = 2; + print STDERR "Casava 1.8 read id style\n"; # TESTING +} else { + print STDERR "Cannot determine read id style\n"; + print STDOUT "Unknwon id style: $readid\n"; + exit 1; +} + +chomp $filename_r1; +chomp $filename_r2; + +if ($quick) { + $failure = &quickcheck($filename_r1, $filename_r2); # quickie check + print STDERR "FAILED quickcheck\n" if ($failure); +} +# If we aren't doing the quick check, or the quick check passed, do the slow check +if (!($quick) || !($failure)) { + $failure = &percent($filename_r1, $filename_r2); # slow thorough check +} + +exit; + +######################## subs ######################### +# do a quick check to see if the files are bad (many bad files will pass this quick check) +sub quickcheck { + my ($file1, $file2) = @_; + if ($solaris) { + $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF + $result2 = `tail -10000l $file2 | head -1`; + } else { + $result1 = `tail -n 10000 $file1 | head -n 1`; + $result2 = `tail -n 10000 $file2 | head -n 1`; + } + # process the ids depending on id style + if ($id_type == 1) { # 1 or 2 at end of id + chomp $result1; + chomp $result2; + chop $result1; + chop $result2; + } else { # first word is a unique read id, second word is left/right-specific + ($result1) = split ' ', $result1; + ($result2) = split ' ', $result2; + } + if ($result1 eq $result2) { +# print "pass\n"; + return 0; + } else { +# print "fail\n"; + return 1; + } +} + + +# compare every read id from each file and determine what percent of reads are +# out of sync +sub percent { + my($file1, $file2) = @_; + + open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!"; + open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!"; + + my $readCount = 0; + my $failcount = 0; + while (!eof(R1) and !eof(R2)) { + $id1 = <R1>; + $id2 = <R2>; + next unless ($. % 4 == 1); # check every sequence (4th line in the file) + # process the ids depending on id style + if ($id_type == 1) { # 1 or 2 at end of id + chomp $id1; # newline + chop $id1; # last char of the id (should be the "1" or "2") + chomp $id2; + chop $id2; + } else { + ($id1) = split ' ', $id1; + ($id2) = split ' ', $id2; + } + $failcount++ unless ($id1 eq $id2); + $readCount++; + } + my $percent = $failcount / $readCount; + $prettypercent = $percent * 100; # make it percent + $prettypercent = sprintf("%.2f", $prettypercent); + if ($failcount > 0) { + print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n"; + $exitStatus=1; + } else { + print STDERR "PASSED full check\n"; + $exitStatus=0; + } + + return $exitStatus; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/resync.pl Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,92 @@ +#!/usr/bin/perl -w + +########################################################### +# resync.pl +# John Garbe +# June 2012 +# +# Resynchronize a pair of paired-end fastq files. Read in two unsynchronized +# fastq files and write out two synchronized fastq files. The +# synchronized files have properly paired reads, with singelton reads removed. +# +# +########################################################### + +die "USAGE: resync.pl sample1_R1.fastq sample1_R2.fastq [sample1_R1_synced.fastq] [sample1_R2_synced.fastq]\n" unless ($#ARGV >= 1 && $#ARGV <= 3); + +# open up input files +open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n"; +open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n"; + +# open up output files +if ($#ARGV > 1) { + open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n"; +} else { + open O1, ">$ARGV[0].out" or die "cannot open $ARGV[0].out\n"; +} +if ($#ARGV == 3) { + open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n"; +} else { + open O2, ">$ARGV[1].out" or die "cannot open $ARGV[1].out\n"; +} + +$readid = `head -n 1 $ARGV[0]`; +chomp $readid; +if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line + $id_type = 1; + print STDERR "Casava 1.7 read id style\n"; # TESTING +} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace + $id_type = 2; + print STDERR "Casava 1.8 read id style\n"; # TESTING +} else { + print STDERR "Cannot determine read id style\n"; + print STDOUT "Unknwon id style: $readid\n"; + exit 1; +} + + +# read in the first fastq file and store in a hash +$f1readcount = 0; +while ($f1line1 = <F1>) { + $f1readcount++; + if ($f1readcount % 1000000 == 0) { # print progress update + print STDERR "$f1readcount reads processed in first file\n"; + } + $f1line2 = <F1>; + $f1line3 = <F1>; + $f1line4 = <F1>; + + ($id, $junk) = split /\//, $f1line1; + $r1{$id} = $f1line1 . $f1line2 . $f1line3 . $f1line4; +} + +# read in the second fastq file, printing out both read pairs if available, otherwise tossing the reads +$f2readcount = 0; +$f1missed = $f1readcount; +$f2missed = 0; +while ($f2line1 = <F2>) { + $f2readcount++; + if ($f2readcount % 1000000 == 0) { # print progress update + print STDERR "$f2readcount reads processed in second file\n"; + } + $f2line2 = <F2>; + $f2line3 = <F2>; + $f2line4 = <F2>; + + ($id, $junk) = split /\//, $f2line1; + if (defined($r1{$id})) { + $f1missed--; + print O1 "$r1{$id}"; + print O2 "$f2line1"; + print O2 "$f2line2"; + print O2 "$f2line3"; + print O2 "$f2line4"; + } else { + $f2missed++; + } +} + +print "$f1readcount reads in $ARGV[0], $f1missed without mate\n"; +print "$f2readcount reads in $ARGV[1], $f2missed without mate\n"; + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/resync.xml Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,32 @@ +<tool id="resync" name="resync: Paired-end resynchronization" version="1.0"> + <description>Resynchronize a pair of paired-end fastq files</description> + <command interpreter="perl"> + resync.pl $input1 $input2 $output1 $output2 + </command> + <inputs> + <param name="input1" type="data" format="fastq" label="Input 1"/> + <param name="input2" type="data" format="fastq" label="Input 2"/> + </inputs> + <stdio> + <exit_code range="1:" level="fatal" description="Bad input dataset" /> + </stdio> + <outputs> + <data format_source="input1" name="output1" label="resync ${input1.name}"/> + <data format_source="input2" name="output2" label="resync ${input2.name}"/> + </outputs> + <tests> + <test> + </test> + </tests> + <help> +Resynchronize a pair of paired-end fastq files. + +Reads in two potentially unsynchronized fastq files and writes out two synchronized fastq files. + +This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. +Other types of read ID formats may cause a terminal error. + +The synchronized files have properly paired reads, with singelton reads removed. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reads1.fastqsanger Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,20 @@ +@HWUSI-EAS1737:7:1:4411:1170#CAGATC/1 +AGAAGGATATGGTGAAGAAATAGCCTGCACTCAGAATGGCCAGATGTACTTAAACAGGGACATTTGGAAACCTGCC ++ +IIGHIIIGIIIIGIIIIFHGIIIIIIIHIIHIIHIIIIHHIIHIIIGIHIIHIIIIIIIHIHHIIIIIIIIIHIII +@HWUSI-EAS1737:7:1:16187:1196#CAGATC/1 +TACGACTACGTGCGTGAGTTTAAAAGAAGCACAGGAACCTGGCACCACACTAAACCACTCCTTCCATCCGACCTTC ++ +GHHHHHHHHDHHHHHGDGGGGGGGDHHHAHEEGGGGGGFGGEGGGHHHHHHHHHHHHHHHHFBHHBEEFEGGEGCH +@HWUSI-EAS1737:7:1:19051:1194#CAGATC/1 +AAGTACCTGCTTTCAACGTGTTGAGGGTTGACATAGGTGCTTGAAGAACAGAATGTAACATTTTGTGGTGTAAAAT ++ +IIIIIIIIIIIIIIIIIIIGIIIIIIHIIHIIHIIIIGIIIIHIIIIIIIIIHIIHIIIIHIHIHFIHGGEIHHHI +@HWUSI-EAS1737:7:1:10050:1205#CAGATC/1 +ATATGCACATTTTTAGAATTCCATTATCTTGAATACAAATAGGGAAATATCTTTTGGAAATTAAAATAATTTCAAA ++ +GHHHHHFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH +@HWUSI-EAS1737:7:1:16633:1206#CAGATC/1 +GGAGGAACAGTAAGCATGTATTTTCTGATGACGTATTTTCTGACAATTTCTATGGAAGAAGCTTCCCCCATGCCCT ++ +HHHHHHHHHHFHHFHHGHGHHHHHHHHHHHHHHHGHHHHHHHGHHGHHHHHHHHHGHHHHHHHHHHHGHHCFEGF>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reads2.fastqsanger Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,20 @@ +@HWUSI-EAS1737:7:1:4411:1170#CAGATC/2 +....AGCAAAATCATTAATCATTAACCCCAAAAGCCACCAAAAAAGCTGCTTAATTATTCTCTTGAGTTGCTGAAT ++ +####)////-@CC@C@@@@@@@@@@CCCCCCC@@;@@@@C@C@@44@@@CC@@C@@@@CCC@@CC@@@@@@C@@@@ +@HWUSI-EAS1737:7:1:16187:1196#CAGATC/2 +.CCCAAACAAGAGCTAAAGCTCAAGTTTGTAAGCATCATAAAACCACCAAAGAGTTACCATACCAAAGCCCAGCAC ++ +#*,'&-*+*)*6636@@@@@:<<<<16544@@@@@@@@@@@@44@@@@@@@5@@@<<<<572777@@@@@@@@@@C +@HWUSI-EAS1737:7:1:19051:1194#CAGATC/2 +.TTTTCTTGCAATAATAGTATGATTTTGAGGTTAAGGGTGCATGCTCTTCTAATGCAAAATATTGTATTTATTTAG ++ +#110071117CCCCCCCC@C@@C@@@C@@C@@@@@<::<<@@@@@@C@CC@C@C@CC@@@<44<<@@@C@C@444@ +@HWUSI-EAS1737:7:1:10050:1205#CAGATC/2 +GATGATAACTAACCTTTTAAACTCATCAACTGGACAGGTTTCATAGCGGCAGACATTAAAGGTACAGGTTCCTGGA ++ +IHIIIIHGIIIIIIIIIIIHIIIIIIHIIIHIIIIHIIIIIIHIIIIIIHDIHHGIIFIHIDBHHDDD@DIFFFEE +@HWUSI-EAS1737:7:1:16633:1206#CAGATC/2 +GAAACCCTTGGCCAAAAACTACCTCTCTGTTGCAGGCTCCCTGCAGCTCGATCTCAACCGCATGCCCAAGCCAGCC ++ +IIIHIIHGIIHHIIIIGIEIGDIGGBGGGGIIIIIIH>IIIIGEIBBEGFBHHDBEDEFBE<3CEAC>???;-8??
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reads_unsync_1.fastqsanger Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,20 @@ +@HWUSI-EAS1737:7:1:4411:1170#CAGATC/1 +AGAAGGATATGGTGAAGAAATAGCCTGCACTCAGAATGGCCAGATGTACTTAAACAGGGACATTTGGAAACCTGCC ++ +IIGHIIIGIIIIGIIIIFHGIIIIIIIHIIHIIHIIIIHHIIHIIIGIHIIHIIIIIIIHIHHIIIIIIIIIHIII +@HWUSI-EAS1737:7:1:10050:1205#CAGATC/1 +ATATGCACATTTTTAGAATTCCATTATCTTGAATACAAATAGGGAAATATCTTTTGGAAATTAAAATAATTTCAAA ++ +GHHHHHFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH +@HWUSI-EAS1737:7:1:16187:1196#CAGATC/1 +TACGACTACGTGCGTGAGTTTAAAAGAAGCACAGGAACCTGGCACCACACTAAACCACTCCTTCCATCCGACCTTC ++ +GHHHHHHHHDHHHHHGDGGGGGGGDHHHAHEEGGGGGGFGGEGGGHHHHHHHHHHHHHHHHFBHHBEEFEGGEGCH +@HWUSI-EAS1737:7:1:19051:1194#CAGATC/1 +AAGTACCTGCTTTCAACGTGTTGAGGGTTGACATAGGTGCTTGAAGAACAGAATGTAACATTTTGTGGTGTAAAAT ++ +IIIIIIIIIIIIIIIIIIIGIIIIIIHIIHIIHIIIIGIIIIHIIIIIIIIIHIIHIIIIHIHIHFIHGGEIHHHI +@HWUSI-EAS1737:7:1:16633:1206#CAGATC/1 +GGAGGAACAGTAAGCATGTATTTTCTGATGACGTATTTTCTGACAATTTCTATGGAAGAAGCTTCCCCCATGCCCT ++ +HHHHHHHHHHFHHFHHGHGHHHHHHHHHHHHHHHGHHHHHHHGHHGHHHHHHHHHGHHHHHHHHHHHGHHCFEGF>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reads_unsync_2.fastqsanger Tue Feb 05 15:23:18 2013 -0500 @@ -0,0 +1,16 @@ +@HWUSI-EAS1737:7:1:4411:1170#CAGATC/2 +....AGCAAAATCATTAATCATTAACCCCAAAAGCCACCAAAAAAGCTGCTTAATTATTCTCTTGAGTTGCTGAAT ++ +####)////-@CC@C@@@@@@@@@@CCCCCCC@@;@@@@C@C@@44@@@CC@@C@@@@CCC@@CC@@@@@@C@@@@ +@HWUSI-EAS1737:7:1:19051:1194#CAGATC/2 +.TTTTCTTGCAATAATAGTATGATTTTGAGGTTAAGGGTGCATGCTCTTCTAATGCAAAATATTGTATTTATTTAG ++ +#110071117CCCCCCCC@C@@C@@@C@@C@@@@@<::<<@@@@@@C@CC@C@C@CC@@@<44<<@@@C@C@444@ +@HWUSI-EAS1737:7:1:10050:1205#CAGATC/2 +GATGATAACTAACCTTTTAAACTCATCAACTGGACAGGTTTCATAGCGGCAGACATTAAAGGTACAGGTTCCTGGA ++ +IHIIIIHGIIIIIIIIIIIHIIIIIIHIIIHIIIIHIIIIIIHIIIIIIHDIHHGIIFIHIDBHHDDD@DIFFFEE +@HWUSI-EAS1737:7:1:16633:1206#CAGATC/2 +GAAACCCTTGGCCAAAAACTACCTCTCTGTTGCAGGCTCCCTGCAGCTCGATCTCAACCGCATGCCCAAGCCAGCC ++ +IIIHIIHGIIHHIIIIGIEIGDIGGBGGGGIIIIIIH>IIIIGEIBBEGFBHHDBEDEFBE<3CEAC>???;-8??