Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 15:2e3d976e7d5d draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 03183e29f807ec33548016a7c4144f52720b7b9e
author | artbio |
---|---|
date | Sun, 21 Apr 2024 09:44:51 +0000 |
parents | bf866bedd4b4 |
children |
comparison
equal
deleted
inserted
replaced
14:bf866bedd4b4 | 15:2e3d976e7d5d |
---|---|
69 # load genome into dictionary and compute length | 69 # load genome into dictionary and compute length |
70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) |
71 genome = defaultdict(dict) | 71 genome = defaultdict(dict) |
72 | 72 |
73 for chr in g.keys(): | 73 for chr in g.keys(): |
74 genome[chr]['sequence'] = g[chr].seq | 74 genome[chr]['sequence'] = str(g[chr].seq) |
75 genome[chr]['length'] = len(g[chr].seq) | 75 genome[chr]['length'] = len(g[chr].seq) |
76 | 76 |
77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter |
78 repeat_elements = set() | 78 repeat_elements = set() |
79 rep_coords = defaultdict(list) # Merged dictionary for coordinates | 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates |
98 # generate spacer for pseudogenomes | 98 # generate spacer for pseudogenomes |
99 spacer = ''.join(['N' for i in range(gapl)]) | 99 spacer = ''.join(['N' for i in range(gapl)]) |
100 | 100 |
101 # generate metagenomes and save them to FASTA files for bowtie build | 101 # generate metagenomes and save them to FASTA files for bowtie build |
102 for repname in rep_coords: | 102 for repname in rep_coords: |
103 metagenome = '' | 103 genomes_list = [] |
104 # iterating coordinate list by block of 3 (chr, start, end) | 104 # iterating coordinate list by block of 3 (chr, start, end) |
105 block = 3 | 105 block = 3 |
106 for i in range(0, len(rep_coords[repname]) - block + 1, block): | 106 for i in range(0, len(rep_coords[repname]) - block + 1, block): |
107 batch = rep_coords[repname][i:i+block] | 107 batch = rep_coords[repname][i:i+block] |
108 chromosome = batch[0] | 108 chromosome = batch[0] |
109 start = max(int(batch[1]) - flankingl, 0) | 109 start = max(int(batch[1]) - flankingl, 0) |
110 end = min(int(batch[2]) + flankingl, | 110 end = min(int(batch[2]) + flankingl, |
111 int(genome[chromosome]['length'])-1) + 1 | 111 int(genome[chromosome]['length'])-1) + 1 |
112 metagenome = ( | 112 genomes_list.append(genome[chromosome]['sequence'][start:end]) |
113 f"{metagenome}{spacer}" | 113 metagenome = spacer.join(genomes_list) |
114 f"{genome[chromosome]['sequence'][start:end]}" | |
115 ) | |
116 | |
117 # Create Fasta of repeat pseudogenome | 114 # Create Fasta of repeat pseudogenome |
118 fastafilename = f"{repname}.fa" | 115 fastafilename = f"{repname}.fa" |
119 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 116 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') |
120 SeqIO.write(record, fastafilename, "fasta") | 117 SeqIO.write(record, fastafilename, "fasta") |
121 | 118 |