comparison bin/maf-sort @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:06f8460885ff
#! /bin/sh

# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence (or of the n-th
# sequence, with option "-n").  Also, merge identical alignments.
# Comment lines starting with "#" are written at the top, in unchanged
# order.  If option "-d" is specified, then alignments that appear only
# once are omitted (like uniq -d).
#
# Exit status: 2 if the command line contains an unknown option or an
# option that lacks its argument.

# Minor flaws, that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name.  I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.

# Make "sort" use a standard ordering:
LC_ALL=C
export LC_ALL

# Print the help message on standard output.
usage() {
    cat <<EOF
Usage: $(basename "$0") [options] my-alignments.maf

Options:
-h show this help message and exit
-d only print duplicate alignments
-n sort by the n-th sequence (default: 1)
EOF
}

# uniqOpt: which member of each run of identical alignments gets printed
# (1 = the first, so every alignment once; 2 = the second, so only
# alignments that occur more than once).
uniqOpt=1
# whichSequence: 1-based index of the sequence whose fields are the sort keys.
whichSequence=1
while getopts hdn: opt
do
    case $opt in
        h)  usage
            exit
            ;;
        d)  uniqOpt=2
            ;;
        n)  whichSequence=$OPTARG
            ;;
        *)  # Unknown option or missing argument: getopts has already
            # printed a diagnostic.  Stop here instead of carrying on
            # and processing input the user did not ask for.
            usage >&2
            exit 2
            ;;
    esac
done
# Drop the parsed options, so that "$@" holds only the input file names.
shift $((OPTIND - 1))

# Work out which space-separated fields "sort" must look at.
#
# Once an alignment has been joined into one line, each "s" line adds six
# space-separated fields to it, so the fields of the n-th sequence end at
# field 6*n.  Counting back from there gives the four sort keys:
#   a = sequence name, b = strand, c = start position (numeric),
#   d = the field that orders by end position (numeric).
# Each variable ends up as "N,N", i.e. a key made of exactly one field.

# Validate the sequence index before it reaches $(( )): the arithmetic
# would otherwise evaluate arbitrary expressions, choke on "08" (bad
# octal), or turn 0 and negative numbers into invalid sort keys.
seqIndex=${whichSequence-1}
# Strip leading zeros, so that the number is always read as decimal.
seqIndex=${seqIndex#"${seqIndex%%[!0]*}"}
case $seqIndex in
    '' | *[!0-9]*)
        echo "$(basename "$0"): option -n needs a positive integer, not '${whichSequence-}'" >&2
        exit 2
        ;;
esac

baseField=$((6 * $seqIndex))
a=$(($baseField - 4))
a=$a,$a
b=$(($baseField - 1))
b=$b,$b
c=$(($baseField - 3))
c=$c,$c
d=$(($baseField - 2))
d=$d,$d

# The alignments are sorted by temporarily packing each one onto a
# single line, so that the line-oriented "sort" can be used on them:
#
#   encode:  - Put a 9-digit serial number after the "#" of each comment
#              line, so that sorting won't change their order.
#            - Replace spaces by BEL characters, except in "s" lines.
#            - Replace the newline of every line that starts with a word
#              character by a backspace, which joins each alignment into
#              one big line.
#   sort:    Order the packed lines by the keys in $a, $b, $c and $d.
#   filter:  Print only the first (or second, when uniqOpt is 2) line of
#            each run of identical lines.
#   decode:  Remove the serial numbers from the "#" lines, and restore
#            the spaces and newlines.

perl -pe '
    s/^#/sprintf("#%.9d", $serial++)/e;
    tr/ /\a/ unless /^s/;
    tr/\n/\b/ if /^\w/;
' "$@" |
    sort -k"$a" -k"$b" -k"${c}n" -k"${d}n" |
    perl -ne '
        $run = 0 if $prev ne $_;
        $prev = $_;
        print if ++$run == '"$uniqOpt"';
    ' |
    perl -pe '
        s/^#.{9}/#/;
        tr/\a\b/ \n/;
    '