Mercurial > repos > peterjc > fastq_filter_by_id
diff tools/fastq/fastq_filter_by_id.py @ 3:e0041942a12d draft default tip
v0.0.5 - galaxy_sequence_utils dependency and other cleanups inc using MIT license
author | peterjc |
---|---|
date | Fri, 03 Feb 2017 05:34:18 -0500 |
parents | d570cc324779 |
children |
line wrap: on
line diff
--- a/tools/fastq/fastq_filter_by_id.py Tue Jun 07 17:24:08 2011 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-"""Filter a FASTQ file with IDs from a tabular file, e.g. from BLAST.
-
-NOTE - This script is now OBSOLETE, having been replaced by a new verion
-which handles FASTA, FASTQ and SFF all in one.
-
-Takes five command line options, tabular filename, ID column numbers
-(comma separated list using one based counting), input FASTA filename, and
-two output FASTA filenames (for records with and without the given IDs).
-
-If either output filename is just a minus sign, that file is not created.
-This is intended to allow output for just the matched (or just the non-matched)
-records.
-
-Note in the default NCBI BLAST+ tabular output, the query sequence ID is
-in column one, and the ID of the match from the database is in column two.
-Here sensible values for the column numbers would therefore be "1" or "2".
-
-This script is copyright 2010-2011 by Peter Cock, SCRI, UK. All rights reserved.
-See accompanying text file for licence details (MIT/BSD style).
-
-This is version 0.0.4 of the script.
-"""
-import sys
-from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-
-def stop_err( msg ):
-    sys.stderr.write( msg )
-    sys.exit()
-
-#Parse Command Line
-try:
-    tabular_file, cols_arg, in_file, out_positive_file, out_negative_file = sys.argv[1:]
-except ValueError:
-    stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
-try:
-    columns = [int(arg)-1 for arg in cols_arg.split(",")]
-except ValueError:
-    stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
-
-#Read tabular file and record all specified identifiers
-ids = set()
-handle = open(tabular_file, "rU")
-if len(columns)>1:
-    #General case of many columns
-    for line in handle:
-        if line.startswith("#"):
-            #Ignore comments
-            continue
-        parts = line.rstrip("\n").split("\t")
-        for col in columns:
-            ids.add(parts[col])
-    print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
-else:
-    #Single column, special case speed up
-    col = columns[0]
-    for line in handle:
-        if not line.startswith("#"):
-            ids.add(line.rstrip("\n").split("\t")[col])
-    print "Using %i IDs from tabular file" % (len(ids))
-handle.close()
-
-#Write filtered FASTQ file based on IDs from tabular file
-reader = fastqReader(open(in_file, "rU"))
-if out_positive_file != "-" and out_negative_file != "-":
-    print "Generating two FASTQ files"
-    positive_writer = fastqWriter(open(out_positive_file, "w"))
-    negative_writer = fastqWriter(open(out_negative_file, "w"))
-    for record in reader:
-        #The [1:] is because the fastaReader leaves the @ on the identifer.
-        if record.identifier and record.identifier.split()[0][1:] in ids:
-            positive_writer.write(record)
-        else:
-            negative_writer.write(record)
-    positive_writer.close()
-    negative_writer.close()
-elif out_positive_file != "-":
-    print "Generating matching FASTQ file"
-    positive_writer = fastqWriter(open(out_positive_file, "w"))
-    for record in reader:
-        #The [1:] is because the fastaReader leaves the @ on the identifer.
-        if record.identifier and record.identifier.split()[0][1:] in ids:
-            positive_writer.write(record)
-    positive_writer.close()
-elif out_negative_file != "-":
-    print "Generating non-matching FASTQ file"
-    negative_writer = fastqWriter(open(out_negative_file, "w"))
-    for record in reader:
-        #The [1:] is because the fastaReader leaves the @ on the identifer.
-        if not record.identifier or record.identifier.split()[0][1:] not in ids:
-            negative_writer.write(record)
-    negative_writer.close()
-else:
-    stop_err("Neither output file requested")
-reader.close()