Mercurial > repos > artbio > repenrich
comparison RepEnrich_setup.py @ 15:2e3d976e7d5d draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 03183e29f807ec33548016a7c4144f52720b7b9e
| author | artbio |
|---|---|
| date | Sun, 21 Apr 2024 09:44:51 +0000 |
| parents | bf866bedd4b4 |
| children |
comparison
equal
deleted
inserted
replaced
| 14:bf866bedd4b4 | 15:2e3d976e7d5d |
|---|---|
| 69 # load genome into dictionary and compute length | 69 # load genome into dictionary and compute length |
| 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | 70 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) |
| 71 genome = defaultdict(dict) | 71 genome = defaultdict(dict) |
| 72 | 72 |
| 73 for chr in g.keys(): | 73 for chr in g.keys(): |
| 74 genome[chr]['sequence'] = g[chr].seq | 74 genome[chr]['sequence'] = str(g[chr].seq) |
| 75 genome[chr]['length'] = len(g[chr].seq) | 75 genome[chr]['length'] = len(g[chr].seq) |
| 76 | 76 |
| 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | 77 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter |
| 78 repeat_elements = set() | 78 repeat_elements = set() |
| 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates | 79 rep_coords = defaultdict(list) # Merged dictionary for coordinates |
| 98 # generate spacer for pseudogenomes | 98 # generate spacer for pseudogenomes |
| 99 spacer = ''.join(['N' for i in range(gapl)]) | 99 spacer = ''.join(['N' for i in range(gapl)]) |
| 100 | 100 |
| 101 # generate metagenomes and save them to FASTA files for bowtie build | 101 # generate metagenomes and save them to FASTA files for bowtie build |
| 102 for repname in rep_coords: | 102 for repname in rep_coords: |
| 103 metagenome = '' | 103 genomes_list = [] |
| 104 # iterating coordinate list by block of 3 (chr, start, end) | 104 # iterating coordinate list by block of 3 (chr, start, end) |
| 105 block = 3 | 105 block = 3 |
| 106 for i in range(0, len(rep_coords[repname]) - block + 1, block): | 106 for i in range(0, len(rep_coords[repname]) - block + 1, block): |
| 107 batch = rep_coords[repname][i:i+block] | 107 batch = rep_coords[repname][i:i+block] |
| 108 chromosome = batch[0] | 108 chromosome = batch[0] |
| 109 start = max(int(batch[1]) - flankingl, 0) | 109 start = max(int(batch[1]) - flankingl, 0) |
| 110 end = min(int(batch[2]) + flankingl, | 110 end = min(int(batch[2]) + flankingl, |
| 111 int(genome[chromosome]['length'])-1) + 1 | 111 int(genome[chromosome]['length'])-1) + 1 |
| 112 metagenome = ( | 112 genomes_list.append(genome[chromosome]['sequence'][start:end]) |
| 113 f"{metagenome}{spacer}" | 113 metagenome = spacer.join(genomes_list) |
| 114 f"{genome[chromosome]['sequence'][start:end]}" | |
| 115 ) | |
| 116 | |
| 117 # Create Fasta of repeat pseudogenome | 114 # Create Fasta of repeat pseudogenome |
| 118 fastafilename = f"{repname}.fa" | 115 fastafilename = f"{repname}.fa" |
| 119 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | 116 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') |
| 120 SeqIO.write(record, fastafilename, "fasta") | 117 SeqIO.write(record, fastafilename, "fasta") |
| 121 | 118 |
