text_processing: awk.xml annotate

annotate awk.xml @ 0:5314e5d6f040 draft

Imported from capsule None

author	bgruening
date	Thu, 29 Jan 2015 07:53:17 -0500
parents
children	20344ce0c811

rev	line source
0 5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	1 <tool id="tp_awk_tool" name="Text reformatting" version="@BASE_VERSION@.0">
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	2 <description>with awk</description>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	3 <macros>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	4 <import>macros.xml</import>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	5 </macros>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	6 <expand macro="requirements">
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	7 <requirement type="package" version="4.1.0">gnu_awk</requirement>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	8 </expand>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	9 <version_command>awk --version | head -n 1</version_command>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	10 <command>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	11 <![CDATA[
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	12 awk
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	13 --sandbox
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	14 -v FS=' '
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	15 -v OFS=' '
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	16 --re-interval
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	17 -f "$awk_script"
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	18 "$infile"
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	19 > "$outfile"
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	20 ]]>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	21 </command>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	22 <inputs>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	23 <param name="infile" format="txt" type="data" label="File to process" />
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	24 <param name="code" type="text" area="true" size="5x35" label="AWK Program" help="">
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	25 <sanitizer>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	26 <valid initial="string.printable">
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	27 <remove value="'"/>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	28 </valid>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	29 </sanitizer>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	30 </param>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	31 </inputs>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	32 <configfiles>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	33 <configfile name="awk_script">$code</configfile>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	34 </configfiles>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	35 <outputs>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	36 <data name="outfile" format_source="infile" metadata_source="infile"/>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	37 </outputs>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	38 <tests>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	39 <test>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	40 <param name="infile" value="awk1.txt" />
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	41 <!-- Commas are not allowed in a value field. Values containing a comma will be split. -->
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	42 <param name="code" value='$2>0.5 { print $2*9"\t"$1 }' />
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	43 <output name="outfile" file="awk_results1.txt" />
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	44 </test>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	45 </tests>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	46 <help>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	47 <![CDATA[
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	48 What it does
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	49
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	50 This tool runs the unix awk command on the selected data file.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	51
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	52 .. class:: infomark
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	53
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	54 TIP:
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	55
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	56 This tool uses the extended regular expression syntax (not the perl syntax).
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	57 \\d, \\w, \\s etc. are not supported.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	58
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	59
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	60 Further reading
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	61
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	62 - Awk by Example (http://www.ibm.com/developerworks/linux/library/l-awk1.html)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	63 - Long AWK tutorial (http://www.grymoire.com/Unix/Awk.html)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	64 - Learn AWK in 1 hour (http://www.selectorweb.com/awk.html)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	65 - awk cheat-sheet (http://cbi.med.harvard.edu/people/peshkin/sb302/awk_cheatsheets.pdf)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	66 - Collection of useful awk one-liners (http://student.northpark.edu/pemente/awk/awk1line.txt)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	67
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	68 -----
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	69
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	70 AWK programs
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	71
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	72 Most AWK programs consist of patterns (i.e. rules that match lines of text) and actions (i.e. commands to execute when a pattern matches a line).
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	73
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	74 The basic form of AWK program is::
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	75
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	76 pattern { action 1; action 2; action 3; }
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	77
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	78
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	79 Pattern Examples
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	80
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	81 - $2 == "chr3" will match lines whose second column is the string 'chr3'
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	82 - $5-$4>23 will match lines where subtracting the value of the fourth column from the value of the fifth column gives a value larger than 23.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	83 - /AG..AG/ will match lines that contain the regular expression AG..AG (meaning the characters AG followed by any two characters followed by AG). (This is the way to specify regular expressions on the entire line, similar to GREP.)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	84 - $7 ~ /A{4}U/ will match lines whose seventh column contains 4 consecutive A's followed by a U. (This is the way to specify regular expressions on a specific field.)
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	85 - 10000 < $4 && $4 < 20000 will match lines whose fourth column value is larger than 10,000 but smaller than 20,000
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	86 - If no pattern is specified, all lines match (meaning the action part will be executed on all lines).
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	87
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	88
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	89 Action Examples
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	90
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	91 - { print } or { print $0 } will print the entire input line (the line that matched in pattern). $0 is a special marker meaning 'the entire line'.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	92 - { print $1, $4, $5 } will print only the first, fourth and fifth fields of the input line.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	93 - { print $4, $5-$4 } will print the fourth column and the difference between the fifth and fourth column. (If the fourth column was start-position in the input file, and the fifth column was end-position - the output file will contain the start-position, and the length).
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	94 - If no action part is specified (not even the curly brackets) - the default action is to print the entire line.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	95
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	96
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	97 AWK's Regular Expression Syntax
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	98
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	99 The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	100
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	101 - *( ) { } [ ] . ? + \ ^ $ are all special characters. \\** can be used to "escape" a special character, allowing that special character to be searched for.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	102 - ^ matches the beginning of a string (but not an internal line).
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	103 - ( .. ) groups a particular pattern.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	104 - { n or n, or n,m } specifies an expected number of repetitions of the preceding pattern.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	105
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	106 - {n} The preceding item is matched exactly n times.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	107 - {n,} The preceding item is matched n or more times.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	108 - {n,m} The preceding item is matched at least n times but not more than m times.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	109
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	110 - [ ... ] creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as a-z.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	111 - . Matches any single character except a newline.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	112 - ***** The preceding item will be matched zero or more times.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	113 - ? The preceding item is optional and matched at most once.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	114 - + The preceding item will be matched one or more times.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	115 - ^ has two meanings:
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	116 - matches the beginning of a line or string.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	117 - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	118 - $ matches the end of a line or string.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	119 - \\| Separates alternate possibilities.
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	120
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	121 @REFERENCES@
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	122 ]]>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	123 </help>
5314e5d6f040 Imported from capsule None bgruening parents: diff changeset	124 </tool>

Mercurial > repos > bgruening > text_processing

annotate awk.xml @ 0:5314e5d6f040 draft