diff Perl/remove_duplicates_in_gff.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Perl/remove_duplicates_in_gff.pl	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,74 @@
+#!/usr/bin/perl
+
+# Usage: remove_duplicates_in_gff.pl <input.gff> <output.gff>
+#
+# Copies a GFF file, dropping every feature tied to an identifier that is
+# carried by more than one CDS line (the CDS "Name" attribute). For such an
+# identifier these lines are removed:
+#   - CDS  lines whose Name equals it
+#   - mRNA lines whose Parent equals it
+#   - gene lines whose ID equals it
+#   - exon lines whose Parent, cut at its first '.', equals it
+# All other lines are copied unchanged.
+
+use strict;
+use warnings;
+
+# Attribute that holds the identifier, per handled feature type (column 3).
+my %id_attribute_for = (
+    CDS  => 'Name',
+    mRNA => 'Parent',
+    gene => 'ID',
+    exon => 'Parent',
+);
+
+# Return the value of attribute $key in an attributes column, or undef if absent.
+# The key must begin an attribute, so 'ID' cannot match inside 'GeneID=...';
+# the value ends at ';' or at the end of the column, so the last attribute
+# is found even without a trailing ';'.
+sub attribute_value {
+    my ($attributes, $key) = @_;
+    return unless defined $attributes;
+    return $attributes =~ /(?:\A|;)\s*\Q$key\E=([^;]*)/ ? $1 : undef;
+}
+
+# Split one GFF line into (feature type, identifier). Returns an empty list when
+# the line is not a CDS/mRNA/gene/exon feature or lacks the expected attribute.
+sub feature_key {
+    my ($line) = @_;
+    (my $record = $line) =~ s/\r?\n\z//;    # parse without the line ending
+    my @fields = split /\t/, $record;
+    my $type   = $fields[2];
+    return unless defined $type && exists $id_attribute_for{$type};
+
+    my $id = attribute_value($fields[8], $id_attribute_for{$type});
+    return unless defined $id;
+
+    # Exon parents are reduced to the text before their first '.'
+    # (presumably "<id>.<transcript suffix>" -- confirm against the input data).
+    $id =~ s/\..*//s if $type eq 'exon';
+    return ($type, $id);
+}
+
+die "Usage: $0 <input.gff> <output.gff>\n" if @ARGV < 2;
+my ($file, $out) = @ARGV;
+
+# Pass 1: count the CDS lines carrying each Name.
+my %cds_count_for;
+open(my $in, '<', $file) or die "Cannot read '$file': $!\n";
+while (my $line = <$in>) {
+    my ($type, $id) = feature_key($line);
+    $cds_count_for{$id}++ if defined $id && $type eq 'CDS';
+}
+close($in);
+
+# Pass 2: copy the file, skipping lines filed under a duplicated identifier.
+open($in, '<', $file) or die "Cannot read '$file': $!\n";
+open(my $out_fh, '>', $out) or die "Cannot write '$out': $!\n";
+while (my $line = <$in>) {
+    my (undef, $id) = feature_key($line);
+    next if defined $id && ($cds_count_for{$id} || 0) > 1;
+    print {$out_fh} $line;
+}
+close($in);
+close($out_fh) or die "Cannot close '$out': $!\n";