#!/usr/bin/env python
import bz2
import ftplib
import gzip
import json
import optparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
import urllib.parse
import urllib.request
import zipfile
from binascii import hexlify

# 2**20 bytes (1 MiB). Not referenced by any code visible in this file.
CHUNK_SIZE = 2**20

# Same string as the data table key emitted by main(); main() currently spells
# that key out literally rather than using this constant.
DEFAULT_DATA_TABLE_NAME = "indexed_maf_files"

# Nice solution to opening compressed files (zip/bz2/gz) transparently
# https://stackoverflow.com/a/13045892/638445

class CompressedFile:
    """Base class pairing a magic-number signature with a decompressing reader.

    Subclasses set ``magic`` (the leading bytes identifying the format),
    ``file_type`` and ``mime_type``, and override :meth:`open` to return an
    object that yields the decompressed data.  ``proper_extension`` is
    declared here but not set by any subclass visible in this file.
    """

    magic = None
    file_type = None
    mime_type = None
    proper_extension = None

    def __init__(self, f):
        # ``f`` is the source to decompress: an open file / file-like object,
        # or (as the factory function in this file does) a path string.
        self.f = f
        # The reader is created eagerly, at construction time.
        self.accessor = self.open()

    @classmethod
    def is_magic(cls, data):
        """Return True when ``data`` begins with this class's magic bytes.

        A diagnostic line comparing the expected signature with the first
        four bytes of ``data`` is printed to stdout on every call.
        """
        expected_hex = hexlify(cls.magic)
        leading_hex = hexlify(data[:4])
        print('%s: %s (m) <=> %s (f)' % (cls.file_type, expected_hex.decode(), leading_hex.decode()))
        return hexlify(data).startswith(expected_hex)

    def open(self):
        """Return the decompressing reader; the base class has none."""
        return None


class ZIPFile(CompressedFile):
    """Accessor for ZIP archives.

    ``accessor`` is a readable binary stream over the first regular (non
    directory) member of the archive, so it can be handed to
    ``shutil.copyfileobj`` like the bz2 and gzip accessors.  Any further
    members of a multi-file archive are ignored.
    """

    magic = b'\x50\x4b\x03\x04'
    file_type = 'zip'
    mime_type = 'compressed/zip'

    def open(self):
        """Open the archive and return a stream over its first file member.

        :raises IOError: if the archive contains no file members.
        """
        # A ZipFile is a container, not a byte stream: its read() expects a
        # member name, so it cannot be copied from directly.  Keep a handle
        # on the archive so it stays reachable while the member is read.
        self.archive = zipfile.ZipFile(self.f)
        members = [name for name in self.archive.namelist() if not name.endswith('/')]
        if not members:
            self.archive.close()
            raise IOError('zip archive %s contains no files' % (self.f,))
        return self.archive.open(members[0])


class BZ2File(CompressedFile):
    """Accessor for bzip2 streams, recognised by their leading magic bytes."""

    file_type = 'bz2'
    mime_type = 'compressed/bz2'
    magic = b'\x42\x5a\x68'

    def open(self):
        """Return a ``bz2.BZ2File`` reader over the wrapped source."""
        reader = bz2.BZ2File(self.f)
        return reader


class GZFile(CompressedFile):
    """Accessor for gzip streams, recognised by their leading magic bytes."""

    magic = b'\x1f\x8b\x08'
    file_type = 'gz'
    mime_type = 'compressed/gz'

    def open(self):
        """Return a ``gzip.GzipFile`` reader over the wrapped source.

        The source may be a path or an already open binary file object.
        """
        # GzipFile's first positional parameter is a *filename*; an open file
        # object has to be passed through ``fileobj`` instead.
        if hasattr(self.f, 'read'):
            return gzip.GzipFile(fileobj=self.f)
        return gzip.GzipFile(filename=self.f)


# factory function to create a suitable instance for accessing files
def get_compressed_file(filename):
    """Return a CompressedFile wrapper matching the file's magic bytes.

    The first 16 bytes of ``filename`` are compared against the zip, bz2 and
    gzip signatures, in that order.  The first matching class is instantiated
    with ``filename``; ``None`` is returned when nothing matches.
    """
    with open(filename, 'rb') as handle:
        print('seek: %d' % handle.tell())
        header = handle.read(16)

    # The probe handle is closed by now; the wrapper reopens the file itself.
    for candidate in (ZIPFile, BZ2File, GZFile):
        if candidate.is_magic(header):
            return candidate(filename)
    return None

def url_download(url, tmp=False, localpath=None):
    """Download ``url`` to a local file and decompress it when needed.

    The local file is named after the last ``/``-separated component of
    ``url``.  If the downloaded file starts with a zip, bz2 or gzip
    signature it is decompressed next to the download under the same name
    minus its last extension, and the compressed download is deleted.

    :param url: full URL of the file to fetch
    :type url: str.
    :param tmp: when truthy, download into a freshly created temporary
        directory (takes precedence over ``localpath``)
    :param localpath: directory to download into when ``tmp`` is falsy;
        ``None`` leaves the file name relative to the working directory
    :returns: path of the downloaded (and possibly decompressed) file.  When
        an I/O error occurs it is reported on stderr, leftover files are
        removed, and the path is still returned even though nothing exists
        there -- callers relying on the file must check for it.
    """

    # Generate file_name
    file_name = url.split('/')[-1]
    if tmp:
        file_name = os.path.join(tempfile.mkdtemp(), file_name)
    elif localpath is not None:
        file_name = os.path.join(localpath, file_name)

    # Remembered so a half-written decompressed file can be cleaned up.
    partial_output = None
    try:
        # download URL (FTP and HTTP work, probably local and data too)
        urllib.request.urlretrieve(url, file_name)

        # uncompress file if needed
        cf = get_compressed_file(file_name)
        print('cf: %s' % str(cf))
        if cf is not None:
            partial_output = os.path.splitext(file_name)[0]
            try:
                with open(partial_output, 'wb') as uncompressed_file:
                    shutil.copyfileobj(cf.accessor, uncompressed_file)
            finally:
                # Release the decompressor's file handle whatever happens.
                cf.accessor.close()
            os.remove(file_name)
            file_name = partial_output
        print(file_name)
    except IOError as e:
        sys.stderr.write('Error occured downloading reference file: %s\n' % e)
        # The download may never have reached the disk, so only delete what
        # is really there instead of raising again from inside the handler.
        for leftover in (file_name, partial_output):
            if leftover is not None and os.path.exists(leftover):
                os.remove(leftover)
    return file_name


def generate_metadata(params, options):
    """Build the display name, unique id and species list for an alignment.

    Example of the kind of data table row these values feed into:
    3-way multiZ (hg18,panTro2,rheMac2)    3_WAY_MULTIZ_hg18   hg18,panTro2,rheMac2    hg18,panTro2,rheMac2    /PATH/chrM.maf,/PATH/chrX.maf,/PATH/chrY.maf

    :param params: unused; kept so the call signature stays stable
    :param options: parsed command-line options; ``options.name`` is the
        display name and ``options.nexus`` the URL of the tree file
    :returns: ``(name, uid, species)`` where ``uid`` is ``name`` with every
        run of non-word characters replaced by ``_``, trimmed of leading and
        trailing ``_`` and upper-cased, and ``species`` lists the leading
        label found on each line of the downloaded tree file
    """
    name = options.name
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    uid = re.sub(r'[\W]+', '_', name).strip('_').upper()

    species = []
    with open(url_download(options.nexus, True), 'r') as fh:
        for line in fh:
            # Drop surrounding whitespace (including the newline) and tree
            # punctuation, then keep what precedes the first ':'.
            label = line.strip().strip(' (),;').split(':')[0].strip()
            # Blank lines and punctuation-only lines carry no species.
            if label:
                species.append(label)
    return name, uid, species

def get_maf_listing(maf_path, max_size=74007):
    """List the gzip-compressed MAF files found in an FTP directory.

    :param maf_path: FTP URL of the directory to list
    :param max_size: skip files larger than this many bytes; ``None``
        disables the check.  TODO: the default is the debugging cap from the
        original code, which was marked "Remove this before PR".
    :returns: list of absolute URLs (``maf_path`` joined with each name) for
        the regular files whose name ends in ``gz`` and that are not
        ``_alt`` / ``_random`` alignments
    """
    maf_url = urllib.parse.urlparse(maf_path)
    # e.g. '_alt.maf.gz', '_random.maf.bz2', ...
    excluded_suffixes = tuple(
        '%s.maf.%s' % (exclusion, compression)
        for compression in ('gz', 'bz2', 'zip')
        for exclusion in ('_alt', '_random')
    )

    maf_files = []
    ftp = ftplib.FTP()
    try:
        # hostname/port rather than netloc, so 'host:port' URLs work too;
        # port 0 makes ftplib fall back to its default port.
        ftp.connect(maf_url.hostname or maf_url.netloc, maf_url.port or 0)
        ftp.login()
        for name, facts in ftp.mlsd(maf_url.path):
            # Test the type first: non-file entries need not carry a size.
            if facts.get('type') != 'file':
                continue
            if not name.endswith('gz'):
                continue
            if name.endswith(excluded_suffixes):
                continue
            if max_size is not None and int(facts.get('size', 0)) > max_size:
                continue
            # if name.startswith('chrUn'):
            #     continue
            maf_files.append(urllib.parse.urljoin(maf_path, name))
    finally:
        # Close the connection even when listing fails part-way through.
        ftp.close()
    return maf_files


def index_maf_files(maf_files, maf_path, options, params, target_directory):
    """Download each MAF file into ``target_directory`` and index it.

    :param maf_files: iterable of MAF file URLs (absolute, or relative to
        ``maf_path``)
    :param maf_path: base URL the entries of ``maf_files`` are resolved against
    :param options: unused; kept so the call signature stays stable
    :param params: unused; kept so the call signature stays stable
    :param target_directory: directory the files are downloaded into
    :returns: list of local paths of the files that ``maf_build_index.py``
        indexed successfully (exit status 0)
    """
    maf_paths = []
    for maf_file in maf_files:
        maf_url = urllib.parse.urljoin(maf_path, maf_file)
        # Must be passed by keyword: the second positional parameter of
        # url_download() is ``tmp``, which would redirect the download into
        # a throw-away temporary directory instead.
        local_maf = url_download(maf_url, localpath=target_directory)
        print(os.path.realpath(local_maf))
        index_command = ['maf_build_index.py', local_maf, local_maf + '.index']
        # Capture stderr explicitly; without a pipe communicate() returns None.
        executor = subprocess.Popen(index_command, stderr=subprocess.PIPE)
        _, stderr = executor.communicate()
        if stderr:
            sys.stderr.write(stderr.decode(errors='replace'))
        if executor.returncode != 0:
            sys.stderr.write('maf_build_index.py exited with status %d for %s\n'
                             % (executor.returncode, local_maf))
            continue
        maf_paths.append(local_maf)
    return maf_paths


def main():
    """Command-line entry point.

    Parses the options, derives the metadata from the tree file, lists the
    MAF files under ``maf/`` relative to the tree URL, downloads and indexes
    them into the current working directory, and prints the resulting data
    table entry as JSON on stdout.
    """
    parser = optparse.OptionParser()
    parser.add_option('-x', '--nexus', dest='nexus', action='store', type='string', help='URL for .nh')
    parser.add_option('-a', '--alignments', dest='alignments', action='store', type='string', help='URL for alignments')
    parser.add_option('-n', '--name', dest='name', action='store', type='string', help='Name')
    parser.add_option('-o', '--output', dest='output', action='store', type='string', help='Output')
    options, _ = parser.parse_args()

    # Both values are used unconditionally below; report a usage error here
    # rather than failing later with an unrelated traceback.
    if options.nexus is None:
        parser.error('the tree URL is required (-x/--nexus)')
    if options.name is None:
        parser.error('a name is required (-n/--name)')

    params = {}

    # TODO: reading the parameters and target directory from the output file
    # is currently disabled; the working directory is used instead.
    # with open(options.output) as fh:
    #     params = json.load(fh)
    # target_directory = params['output_data'][0]['extra_files_path']
    # if not os.path.exists(target_directory):
    #     os.mkdir(target_directory)
    target_directory = os.getcwd()

    display_name, uid, species_list = generate_metadata(params, options)
    species_csv = ','.join(species_list)
    print('%s %s %s' % (display_name, uid, species_csv))
    maf_path = urllib.parse.urljoin(options.nexus, 'maf/')
    maf_files = get_maf_listing(maf_path)

    # NOTE(review): these paths are built from the remote file names, which
    # still end in the compression suffix, while url_download() drops the
    # last extension when it decompresses a file -- verify that the entries
    # match what actually ends up on disk.
    local_paths = [os.path.join(target_directory, maf_file.split('/')[-1]) for maf_file in maf_files]

    data_manager_entry = {
        'data_tables': {
            'indexed_maf_files': {
                'display_name': display_name,
                'value': uid,
                'indexed_for': species_csv,
                'exists_in_maf': species_csv,
                'path': ','.join(local_paths),
            }
        }
    }

    # Fetch the MAFs
    index_maf_files(maf_files, maf_path, options, params, target_directory)

    print(json.dumps(data_manager_entry))

    # TODO: writing the entry to options.output is currently disabled.
    # with open(options.output, 'w') as fh:
    #     fh.write(json.dumps(data_manager_entry))


# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
