diff validate_affy_metadata.py @ 0:80d672b3e6dd draft

Uploaded
author greg
date Thu, 15 Aug 2019 13:17:41 -0400
parents
children b5f5c3d0f349
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/validate_affy_metadata.py	Thu Aug 15 13:17:41 2019 -0400
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+"""
+Validate the metadata file associated with Affymetrix 96 well plate data.
+"""
+import argparse
+import datetime
+import decimal
+import re
+import shutil
+import sys
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input', dest='input', help='Metadata file for Affymetrix 96 well plate data')
+parser.add_argument('--output', dest='output', help='Output dataset'),
+args = parser.parse_args()
+
+EMAIL_MAX_LEN = 255
+VALID_EMAIL_RE = re.compile("[^@]+@[^@]+\.[^@]+")
+
+
+def add_error_msg(accumulated_msgs, msg):
+    return "%s\n%s" % (accumulated_msgs, msg)
+
+
+def empty_value(line_no, label, accumulated_msgs):
+    return add_error_msg(accumulated_msgs, "The required %s value is missing on line %d." % (label, line_no))
+
+
+def stop_error(msg):
+    sys.exit(msg)
+
+
+def string_as_boolean_string(string):
+    if str(string).lower() in ['true', 'yes', 'on', '1']:
+        return 'True'
+    else:
+        return 'False'
+
+
+def validate_date_string(line_no, date_string, column, accumulated_msgs):
+    if len(date_string) == 0:
+        return accumulated_msgs
+    try:
+        datetime.datetime.strptime(date_string, '%Y-%m-%d')
+        return accumulated_msgs
+    except ValueError:
+        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect date format (%s must be YYYY-MM-DD) for column %s." % (line_no, date_string, column))
+
+
+def validate_decimal(line_no, decimal_string, column, accumulated_msgs):
+    try:
+        decimal.Decimal(decimal_string)
+        return accumulated_msgs
+    except Exception:
+        return add_error_msg(accumulated_msgs, "Line %d contains an incorrect decimal value (%s) for column %s." % (line_no, decimal_string, column))
+
+
+def validate_email(line_no, email, accumulated_msgs):
+    if not (VALID_EMAIL_RE.match(email)):
+        return add_error_msg(accumulated_msgs, "Line %d contains an invalid email address (%s).  " % (line_no, email))
+    elif len(email) > EMAIL_MAX_LEN:
+        return add_error_msg(accumulated_msgs, "Line %d contains an email address (%) that is longer than the maximum length, %d characters." % (line_no, email))
+    return accumulated_msgs
+
+
+accumulated_msgs = ""
+# Parse the input file, skipping the header, and validating
+# that each data line consists of 31 comma-separated items.
+with open(args.input, "r") as ih:
+    for i, line in enumerate(ih):
+        if i == 0:
+            # Skip the header.
+            continue
+        # Keep 1-based line value for error messages.
+        line_no = i + 1
+        line = line.rstrip("\r\n")
+        if i > 97:
+            accumulated_msgs = add_error_msg(accumulated_msgs, "The input file contains more than 97 lines (must be 1 header line and no more than 96 data lines).")
+            stop_error(accumulated_msgs)
+        items = line.split("\t")
+        if len(items) != 32:
+            accumulated_msgs = add_error_msg(accumulated_msgs, "Line %d contains %s columns, (must be 32)." % (line_no, len(items)))
+            stop_error(accumulated_msgs)
+        # Required and validated.
+        # Required.
+        user_specimen_id = items[0]
+        if len(user_specimen_id) == 0:
+            accumulated_msgs = empty_value(line_no, "user_specimen_id", accumulated_msgs)
+        # Optional.
+        field_call = items[1]
+        # Optional.
+        bcoral_genet_id = items[2]
+        # Optional.
+        bsym_genet_id = items[3]
+        # Required.
+        reef = items[4]
+        if len(reef) == 0:
+            accumulated_msgs = empty_value(line_no, "reef", accumulated_msgs)
+        # Required.
+        region = items[5]
+        if len(region) == 0:
+            accumulated_msgs = empty_value(line_no, "region", accumulated_msgs)
+        # Required and validated.
+        latitude = items[6]
+        accumulated_msgs = validate_decimal(line_no, latitude, "latitude", accumulated_msgs)
+        # Required and validated.
+        longitude = items[7]
+        accumulated_msgs = validate_decimal(line_no, longitude, "longitude", accumulated_msgs)
+        # Optional.
+        geographic_origin = items[8]
+        # Optional.
+        colony_location = items[9]
+        # Optional.
+        depth = items[10]
+        # Optional.
+        disease_resist = items[11]
+        # Optional.
+        bleach_resist = items[12]
+        # Optional.
+        mortality = items[13]
+        # Optional.
+        tle = items[14]
+        # Optional.
+        spawning = string_as_boolean_string(items[15])
+        # Required.
+        collector_last_name = items[16]
+        if len(collector_last_name) == 0:
+            accumulated_msgs = empty_value(line_no, "collector_last_name", accumulated_msgs)
+        # Required.
+        collector_first_name = items[17]
+        if len(collector_first_name) == 0:
+            accumulated_msgs = empty_value(line_no, "collector_first_name", accumulated_msgs)
+        # Required.
+        org = items[18]
+        if len(org) == 0:
+            accumulated_msgs = empty_value(line_no, "org", accumulated_msgs)
+        # Required and validated.
+        collection_date = items[19]
+        accumulated_msgs = validate_date_string(line_no, collection_date, "collection_date", accumulated_msgs)
+        # Required and validated.
+        contact_email = items[20]
+        accumulated_msgs = validate_email(line_no, contact_email, accumulated_msgs)
+        # Required.
+        seq_facility = items[21]
+        if len(seq_facility) == 0:
+            accumulated_msgs = empty_value(line_no, "seq_facility", accumulated_msgs)
+        # Optional.
+        array_version = items[22]
+        # Optional.
+        public = string_as_boolean_string(items[23])
+        # Optional.
+        public_after_date = items[24]
+        accumulated_msga = validate_date_string(line_no, public_after_date, "public_after_date", accumulated_msgs)
+        # Required and validated.
+        sperm_motility = items[25]
+        accumulated_msgs = validate_decimal(line_no, sperm_motility, "sperm_motility", accumulated_msgs)
+        # Required and validated.
+        healing_time = items[26]
+        accumulated_msgs = validate_decimal(line_no, healing_time, "healing_time", accumulated_msgs)
+        # Optional.
+        dna_extraction_method = items[27]
+        # Optional.
+        dna_concentration = items[28]
+        # If dna_concentration has a value, then it must be decimal.
+        if len(dna_concentration) > 0:
+            accumulated_msgs = validate_decimal(line_no, dna_concentration, "dna_concentration", accumulated_msgs)
+        # Optional.
+        registry_id = items[29]
+        # Optional.
+        result_folder_name = items[30]
+        # Optional.
+        plate_barcode = items[31]
+
+
+if len(accumulated_msgs) > 0:
+    stop_error(accumulated_msgs)
+
+shutil.copyfile(args.input, args.output)