comparison vsnp_statistics.py @ 25:b908bb18008a draft

Uploaded
author greg
date Thu, 16 Sep 2021 00:56:07 +0000
parents b34843f09f9f
children
comparison
equal deleted inserted replaced
24:39ab5405b509 25:b908bb18008a
62 # Gather Series into a data frame. 62 # Gather Series into a data frame.
63 fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id']) 63 fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
64 # Starting at row 3, keep every 4th row, then 64 # Starting at row 3, keep every 4th row, then
65 # randomly sample the specified number of rows. 65 # randomly sample the specified number of rows.
66 file_size = nice_size(os.path.getsize(fastq_file)) 66 file_size = nice_size(os.path.getsize(fastq_file))
67 total_reads = int(len(fastq_df.index) / 4) 67 total_reads = len(seqs)
68 # Mean Read Length 68 # Mean Read Length
69 if sampling_size > total_reads: 69 if sampling_size > total_reads:
70 sampling_size = total_reads 70 sampling_size = total_reads
71 fastq_df = fastq_df.iloc[3::4].sample(sampling_size) 71 try:
72 fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
73 except ValueError:
74 fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
72 dict_mean = {} 75 dict_mean = {}
73 list_length = [] 76 list_length = []
74 i = 0 77 i = 0
75 for id, seq, in fastq_df.iterrows(): 78 for id, seq, in fastq_df.iterrows():
76 dict_mean[id] = numpy.mean(letter_annotations[i]) 79 dict_mean[id] = numpy.mean(letter_annotations[i])