Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff phyloconversion/remove_phytab_dupes.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/remove_phytab_dupes.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,60 @@ +#!/usr/bin/perl +use strict; + + + +my $infile=$ARGV[0]; +my $keeplongest=$ARGV[1]; +my $ignoregaps=$ARGV[2]; +my $uniout=$ARGV[3]; +my $dupout=$ARGV[4]; + +open IN, $infile or die "Cannot open $infile\n"; + +my %UniquesHash; +my @DupeArray; + +while(<IN>){ + my $row = $_; + chomp($row); + my @column = split(/\t/, $row); + my $species = $column[0]; + my $partition = $column[1]; + my $id = $column[2]; + my $sequence = $column[3]; + + if(exists $UniquesHash{$species}{$partition}){ + my @dupeseq = split(/\t/, $UniquesHash{$species}{$partition}); + my ($savlen,$curlen); + if($ignoregaps==1){ + my $nogapsav = $dupeseq[1]; + my $nogapcur = $sequence; + $nogapsav =~ s/\-//g; + $nogapcur =~ s/\-//g; + $savlen = length($nogapsav); + $curlen = length($nogapcur); + }else{ + $savlen = length($dupeseq[1]); + $curlen = length($sequence); + } + if($curlen > $savlen && $keeplongest==1) { #current is longer so keep that one + my $oldline = $species."\t".$partition."\t".$UniquesHash{$species}{$partition}."\n"; + $UniquesHash{$species}{$partition} = "$id\t$sequence"; + push(@DupeArray, $oldline); + }else{ + push(@DupeArray, "$species\t$partition\t$id\t$sequence\n"); + } + }else{ + $UniquesHash{$species}{$partition} = "$id\t$sequence"; + } +} + +open OUT, ">".$uniout or die "Cannot open $uniout\n"; +open DUPES, ">".$dupout or die "Cannot open $dupout\n"; + +print DUPES @DupeArray; +for my $spname ( keys %UniquesHash ) { + for my $partname ( keys %{ $UniquesHash{$spname} } ) { + print OUT "$spname\t$partname\t$UniquesHash{$spname}{$partname}\n"; + } +}