Mercurial > repos > gdroc > scaffhunter
view scaffhunter/scaff2chrom.py @ 2:ec31e4883c99 draft default tip
Uploaded
author | gdroc |
---|---|
date | Mon, 14 Nov 2016 08:04:53 -0500 |
parents | 9c61692acd7b |
children |
line wrap: on
line source
#!/usr/local/bioinfo/python/2.7.9/bin/python # # Copyright 2014 CIRAD # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see <http://www.gnu.org/licenses/> or # write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. # # import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator from Bio.Seq import Seq from Bio.Alphabet import generic_dna from Bio import SeqIO from Bio.SeqRecord import SeqRecord def rev_seq(seq): #function that reverse and complement a sequence my_dna = Seq(seq, generic_dna) return str(my_dna.reverse_complement()) def __main__(): #Parse Command Line parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n" "This program takes in input files a multifasta containing scaffolds and a table file containing scaffolds order calculated by reorderient.py and " "output a multifasta file containing reconstructed chromosomes and an agp file locating scaffolds into chromosomes.\n") # Wrapper options. parser.add_option( '', '--table', dest='table', default=None, help='Table file containing in column 1: chromosome name, column 2: scaffold name, column 3: expected orientation (FWD or REV).') parser.add_option( '', '--seq', dest='seq', default=None, help='Multifasta sequence file containing scaffolds.') parser.add_option( '', '--unknown', dest='unknown', default='yes', help='Build an unknown chromosome with the remaining sequences: yes or no, [default: %default].') parser.add_option( '', '--out', dest='out', default='chromosomes.fasta', help='Fasta output file name.') parser.add_option( '', '--agp', dest='agp', default='chromosomes.agp', help='agp output file name.') (options, args) = parser.parse_args() if options.table == None: sys.exit('--table argument is missing') if options.seq == None: sys.exit('--seq argument is missing') #loading sequences record_dict = SeqIO.index(options.seq, "fasta") file = open(options.table) dico = set() chr = '' debut = 0 sequence = '' scaff = '' outfile = open(options.out,'w') outfile2 = open(options.agp,'w') outfile2.write("##agp-version 2.0\n") status = "" for line in file: data = line.split() if data: if data[1] in dico: print 'Warning '+data[1]+' is found more than once' dico.add(data[1]) if not(data[1] in record_dict): print data[1]+' is not in the fasta file' i -= 1 elif chr == '': i = 0 chr = data[0] if len(data) == 2: sequence = str(record_dict[data[1]].seq) else:#contain information on orientation if data[2] == 'FWD': sequence = str(record_dict[data[1]].seq) elif data[2] == 'REV': sequence = rev_seq(str(record_dict[data[1]].seq)) else: sys.exit('The orientation information is not recognized') scaff = data[1] if len(data) >= 3: status = data[2] else: status = "" elif chr == data[0]: i += 1 if status == 'FWD': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') elif status == 'REV': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t-\n') else: outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') debut = len(sequence) i += 1 sequence = sequence + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tN\t100\tfragment\tno\n') debut = len(sequence) if len(data) == 2: sequence = sequence + str(record_dict[data[1]].seq) else:#contain information on orientation if data[2] == 'FWD': sequence = sequence + str(record_dict[data[1]].seq) elif data[2] == 'REV': sequence = sequence + rev_seq(str(record_dict[data[1]].seq)) else: sys.exit('The orientation information is not recognized') scaff = data[1] if len(data) >= 3: status = data[2] else: status = "" else: i += 1 if status == 'FWD': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') elif status == 'REV': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t-\n') else: outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') SeqIO.write(SeqRecord(Seq(sequence, generic_dna), id = chr, description=''),outfile, "fasta") chr = data[0] debut = 0 i = 0 sequence = '' if len(data) == 2: sequence = sequence + str(record_dict[data[1]].seq) else:#contain information on orientation if data[2] == 'FWD': sequence = sequence + str(record_dict[data[1]].seq) elif data[2] == 'REV': sequence = sequence + rev_seq(str(record_dict[data[1]].seq)) else: sys.exit('The orientation information is not recognized') scaff = data[1] if len(data) >= 3: status = data[2] else: status = "" i += 1 if status == 'FWD': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') elif status == 'REV': outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t-\n') else: outfile2.write(chr+'\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') SeqIO.write(SeqRecord(Seq(sequence, generic_dna), id = chr, description=''),outfile, "fasta") #all chromosomes have been constructed, now it is time to build unknown chromosome if options.unknown == 'yes': liste = [] for n in record_dict: if not(n in dico): liste.append([n, len(str(record_dict[n].seq))]) liste = sorted(liste, key=operator.itemgetter(1), reverse = True) sequence = '' debut = 0 i = 0 for n in liste: if sequence == '': sequence = str(record_dict[n[0]].seq) else: i += 1 outfile2.write('chrUn_random\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') debut = len(sequence) i += 1 sequence = sequence + 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' outfile2.write('chrUn_random\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tN\t100\tfragment\tno\n') debut = len(sequence) sequence = sequence + str(record_dict[n[0]].seq) scaff = n[0] i += 1 if sequence: outfile2.write('chrUn_random\t'+str(debut+1)+'\t'+str(len(sequence))+'\t'+str(i)+'\tW\t'+scaff+'\t1\t'+str(len(sequence)-debut)+'\t+\n') SeqIO.write(SeqRecord(Seq(sequence, generic_dna), id = 'chrUn_random', description=''),outfile, "fasta") outfile.close() outfile2.close() if __name__ == "__main__": __main__()