import time import re import sys import genome_utils def parse_options(argv): """Parses options from the command line """ from optparse import OptionParser, OptionGroup parser = OptionParser() required = OptionGroup(parser, 'REQUIRED') required.add_option('-i', '--infile', dest='infile', metavar='FILE', help='alignment file in sam format', default='-') required.add_option('-o', '--outfile', dest='outfile', metavar='PATH', help='outfile', default='-') optional = OptionGroup(parser, 'OPTIONAL') optional.add_option('-g', '--genome', dest='genome', metavar='FILE', help='genome config file', default='-') optional.add_option('-O', '--organism', dest='organism', metavar='STRING', help='one of [human|drosophila|elegans], default elegans', default='elegans') optional.add_option('-r', '--readfile', dest='readfile', metavar='FILE', help='file containing all reads in FASTQ format, sorted by read_id', default='-') optional.add_option('-l', '--readlength', dest='readlength', metavar='INT', type='int', help='length of given read [75]', default=75) optional.add_option('-L', '--max_lines', dest='max_lines', metavar='INT', type='int', help='maximum number of lines to be handled [1000000000]', default=1000000000) optional.add_option('-v', '--verbose', dest='verbose', action='store_true', help='verbosity', default=False) optional.add_option('-d', '--debug', dest='debug', action='store_true', help='display debug info', default=False) optional.add_option('-A', '--alioto', dest='alioto', action='store_true', help='switch on special handling of alioto data', default=False) optional.add_option('-W', '--white', dest='white', action='store_true', help='switch on special handling of white data', default=False) optional.add_option('-a', '--append', dest='append', action='store_true', help='append mm flag even if flag is already there', default=False) optional.add_option('-H', '--hard', dest='hard', action='store_true', help='try hard to minimize mismatches', default=False) optional.add_option('-f', '--sanitize_flag', dest='sanitize_flag', action='store_true', help='replace flag by sanitized version', default=False) optional.add_option('-M', '--skip_MT', dest='skip_MT', action='store_true', help='do not sanitize MT DNA', default=False) optional.add_option('-N', '--ignore_Ns', dest='ignore_Ns', action='store_false', help='if set, N in a sequence does not count for the distance', default=True) parser.add_option_group(required) parser.add_option_group(optional) (options, args) = parser.parse_args() if len(argv) < 5: parser.print_help() sys.exit(2) return options def complement(st): mapping = {'a':'t', 'c':'g', 't':'a', 'g':'c', 'n':'n', \ 'A':'T', 'C':'G', 'T':'A', 'G':'C', 'N':'N'} _s = '' for s in st: try: _s += mapping[s] except KeyError: print >> sys.stderr, "Genome sequence not in valid format - letter %s can not be found!" % s sys.exit(2) return _s def ed(s1, s2): """Edit distance for two equally long strings. """ global options if (len(s1) == len(s2)): return sum([(1 - int(s1.upper()[i] == s2.upper()[i] or s1.upper()[i] == 'N' or s2.upper()[i] == 'N')) for i in range(len(s1))]) else: print >> sys.stderr, "strings to compare have not same length! - %s %s" % (s1, s2) sys.exit(2) def minimize_mm(sl, fastq_sl, gen, rev_gen, op, size, options): """Minimizes mismatches""" d1 = 0 d2 = 0 sc1 = 0 sc2 = 0 hc1 = 0 hc2 = 0 len_sl = len(fastq_sl[1]) len_gen = len(gen) if op[0] == 'H': hc1 = size[0] if op[-1] == 'H': hc2 = size[-1] if op[0] == 'S': sc1 = size[0] if op[-1] == 'S': sc2 = size[-1] if options.debug: print gen print rev_gen print fastq_sl[1] print fastq_sl[2] print "" ### first trust flags if len(gen) < options.readlength: gap = options.readlength - len(gen) else: gap = 0 _flag = int(sl[1]) if _flag & 128 == 0: if _flag & 16 == 0: min_d1 = ed(fastq_sl[1][gap + hc1 + sc1:len_sl - hc2 - sc2], gen[sc1:len_gen - sc2]) min_d2 = ed(fastq_sl[1][hc1 + sc1:len_sl - hc2 - sc2 - gap], gen[sc1:len_gen - sc2]) else: min_d1 = ed(fastq_sl[1][gap + hc2 + sc2:len_sl - hc1 - sc1], rev_gen[sc2:len_gen - sc1]) min_d2 = ed(fastq_sl[1][hc2 + sc2:len_sl - hc1 - sc1 - gap], rev_gen[sc2:len_gen - sc1]) min_d_ind = -1 else: if _flag & 16 == 0: min_d1 = ed(fastq_sl[2][gap + hc1 + sc1:len_sl - hc2 - sc2], gen[sc1:len_gen - sc2]) min_d2 = ed(fastq_sl[2][hc1 + sc1:len_sl - hc2 - sc2 - gap], gen[sc1:len_gen - sc2]) else: min_d1 = ed(fastq_sl[2][gap + hc2 + sc2:len_sl - hc1 - sc1], rev_gen[sc2:len_gen - sc1]) min_d2 = ed(fastq_sl[2][hc2 + sc2:len_sl - hc1 - sc1 - gap], rev_gen[sc2:len_gen - sc1]) min_d_ind = -2 min_d = min(min_d1, min_d2) if (min_d == 0 or not options.hard) and gap > 0: if (min_d1 <= min_d2 and _flag & 16 == 16) or (min_d1 > min_d2 and _flag & 16 == 0): return (min_d, min_d_ind, 0, gap) elif (min_d1 > min_d2 and _flag & 16 == 16) or (min_d1 <= min_d2 and _flag & 16 == 0): return (min_d, min_d_ind, gap, 0) if min_d > 0 and options.hard: dist = [options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1, \ options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1, options.readlength + 1] d1 = gap if d1 + d2 + sc1 + sc2 + hc1 + hc2 > 0: dist[0] = ed(fastq_sl[1][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], gen[sc1:len_gen - sc2]) dist[1] = ed(fastq_sl[1][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], rev_gen[sc1:len_gen - sc2]) dist[2] = ed(fastq_sl[1][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], gen[sc2:len_gen - sc1]) dist[3] = ed(fastq_sl[1][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], rev_gen[sc2:len_gen - sc1]) dist[6] = ed(fastq_sl[2][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], gen[sc1:len_gen - sc2]) dist[7] = ed(fastq_sl[2][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], rev_gen[sc1:len_gen - sc2]) dist[8] = ed(fastq_sl[2][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], gen[sc2:len_gen - sc1]) dist[9] = ed(fastq_sl[2][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], rev_gen[sc2:len_gen - sc1]) else: dist[4] = ed(fastq_sl[1], gen) dist[5] = ed(fastq_sl[1], rev_gen) dist[10] = ed(fastq_sl[2], gen) dist[11] = ed(fastq_sl[2], rev_gen) min_d_ind = dist.index(min(dist)) min_d = dist[min_d_ind] if gap > 0: # if min_d_ind in [0, 1, 6, 7]: if min_d_ind in [0, 3, 6, 9]: d1 = gap d2 = 0 elif min_d_ind in [1, 2, 7, 8]: d1 = 0 d2 = gap if min_d > 7: best_min_d = min_d best_min_d_ind = min_d_ind best_d1 = d1 best_d2 = d2 for dd in range(1, (gap + 3)/2): d1 = gap - dd d2 = dd dist[0] = ed(fastq_sl[1][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], gen[sc1:len_gen - sc2]) dist[1] = ed(fastq_sl[1][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], rev_gen[sc1:len_gen - sc2]) dist[2] = ed(fastq_sl[1][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], gen[sc2:len_gen - sc1]) dist[3] = ed(fastq_sl[1][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], rev_gen[sc2:len_gen - sc1]) dist[6] = ed(fastq_sl[2][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], gen[sc1:len_gen - sc2]) dist[7] = ed(fastq_sl[2][d1 + sc1 + hc1:len_sl - d2 - sc2 - hc2], rev_gen[sc1:len_gen - sc2]) dist[8] = ed(fastq_sl[2][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], gen[sc2:len_gen - sc1]) dist[9] = ed(fastq_sl[2][d2 + sc2 + hc2:len_sl - d1 - sc1 - hc1], rev_gen[sc2:len_gen - sc1]) min_d_ind = dist.index(min(dist)) min_d = dist[min_d_ind] if min_d < best_min_d: best_min_d = min_d best_min_d_ind = min_d_ind #if min_d_ind in [0, 1, 6, 7]: if min_d_ind in [0, 3, 6, 9]: best_d1 = d1 best_d2 = d2 elif min_d_ind in [1, 2, 7, 8]: best_d1 = d2 best_d2 = d1 if min_d < 7: break min_d = best_min_d min_d_ind = best_min_d_ind d1 = best_d1 d2 = best_d2 return (min_d, min_d_ind, d1, d2) def main(): """Function parsing, sanitizing and writing sam-file (ensembl)""" options = parse_options(sys.argv) outfile = open(options.outfile, 'w') line_counter = 0 f_line_counter = 0 ### load genome gio = genome_utils.GenomeInfo(options.genome) gio.contig_names.sort() genome = dict() fastq = open(options.readfile, 'r') fastq_sl = fastq.readline().strip().split('\t') converted_aligns = 0 skipped_aligns = 0 time_0 = time.time() time_1 = time_0 for line in open(options.infile, 'r'): line_counter += 1 if line_counter > options.max_lines: break if options.verbose and (line_counter % 10000) == 0: last_time = time_1 time_1 = time.time() print '[ %s (%s skipped / %s converted)]' % (line_counter, skipped_aligns, converted_aligns) print 'took %i secs for last 10000 - took %i secs in total - %i secs on avg' % (time_1 - last_time, time_1 - time_0, (time_1 - time_0)/(max(line_counter / 10000, 1))) if line[0] in ['#', '@']: continue sl = line.strip().split('\t') sl[2] = re.sub('chr', '', sl[2]) sl[2] = re.sub('Cel_', '', sl[2]) sl[2] = re.sub('.1\]', '', sl[2]) if sl[2] in ['M', 'MT', 'mitochondrion_genome']: if options.organism == 'human': sl[2] = 'MT' if options.organism == 'elegans': sl[2] = 'MtDNA' if options.organism == 'drosophila': sl[2] = 'dmel_mitochondrion_genome' ### check for skip-options if options.skip_MT and sl[2] in ['MT', 'MtDNA', 'dmel_mitochondrion_genome']: skipped_aligns += 1 print >> outfile, line, continue ### load genome if necessary if not genome.has_key(sl[2]): if sl[2] in gio.contig_names: if options.verbose: print 'Reading chromosome %s' % sl[2] fgen = open('%s/genome/%s.flat' % (gio.basedir, sl[2])) genome[sl[2]] = fgen.readline() fgen.close() else: print >> sys.stderr, 'Chromosome Names do not match genome information file. Chromosome %s can not be found!' % sl[2] exit(2) (size, op) = (re.split('[^0-9]', sl[5])[:-1], re.split('[0-9]*', sl[5])[1:]) size = [int(i) for i in size] ### reconstruct genome_part from CIGAR if options.white and sl[2] in ['MT', 'MtDNA', 'dmel_mitochondrion_genome']: sl[3] = str(int(sl[3]) - 1) gen_start = int(sl[3]) - 1 gen = '' covered = 0 skipped = 0 deletions = 0 insertions = 0 for o in range(len(op)): if op[o] in ['M', 'S']: gen += genome[sl[2]][gen_start + covered + skipped : gen_start + covered + skipped + size[o]].upper() covered += size[o] elif op[o] == 'I': gen += (size[o] * 'X') ### insertions go into distance #covered += size[o] ### but do not cover genome insertions += size[o] elif op[o] == 'D': skipped += size[o] deletions += size[o] elif op[o] == 'N': skipped += size[o] rev_gen = genome_utils.reverse_complement(gen) if (covered + insertions) != len(gen): print >> sys.stderr, "WARNING: CIGAR length could net be resembled - maybe reached end of chromosome - ignore line: %s \n" % line skipped_aligns += 1 continue if len(gen) > options.readlength: print >> sys.stderr, "WARNING: CIGAR was longer then given readlength - ignore line:\n %s \n" % line skipped_aligns += 1 continue # ### add read string # ### 09 - SEQ while (fastq_sl[0] != sl[0] and len(fastq_sl) > 0 and fastq_sl[0] != ''): fastq_sl = fastq.readline().strip().split('\t') f_line_counter += 1 if options.verbose and (f_line_counter % 10000) == 0: print '[line in pseudofastq: %s]' % (f_line_counter) if (fastq_sl[0] != sl[0]): print >> sys.stderr, 'pseudofastq not sorted or not matching the current infile!' sys.exit(1) (min_d, min_d_ind, d1, d2) = minimize_mm(sl, fastq_sl, gen, rev_gen, op, size, options) assert(d1 + d2 + len(gen) == options.readlength) if len(op) > 1 and min_d > 5: ### reconstruct genome_part from reversed CIGAR gen_start = int(sl[3]) - 1 gen = '' covered = 0 skipped = 0 deletions = 0 insertions = 0 op = op[::-1] size = size[::-1] for o in range(len(op)): if op[o] in ['M', 'S']: gen += genome[sl[2]][gen_start + covered + skipped : gen_start + covered + skipped + size[o]].upper() covered += size[o] elif op[o] == 'I': gen += (size[o] * 'X') ### insertions go into distance #covered += size[o] ### but do not cover genome insertions += size[o] elif op[o] == 'D': skipped += size[o] deletions += size[o] elif op[o] == 'N': skipped += size[o] rev_gen = genome_utils.reverse_complement(gen) (_min_d, _min_d_ind, _d1, _d2) = minimize_mm(sl, fastq_sl, gen, rev_gen, op, size, options) if _min_d < min_d: min_d = _min_d min_d_ind = _min_d_ind _cigar = '' for o in range(len(op)): _cigar += (str(size[o]) + op[o]) sl[5] = _cigar d1 = _d1 d2 = _d2 if options.sanitize_flag: _flag = 1 if int(sl[1]) & 2 == 2: _flag += 2 if min_d_ind > -1: if min_d_ind in [1, 3, 5]: _flag += (64 + 16) elif min_d_ind in [0, 2, 4]: _flag += (64) elif min_d_ind in [7, 9, 11]: _flag += (128 + 16) elif min_d_ind in [6, 8, 10]: _flag += (128) elif min_d_ind == -1: _flag += 64 _flag += (int(sl[1]) & 16) elif min_d_ind == -2: _flag += 128 _flag += (int(sl[1]) & 16) sl[1] = str(_flag) if d1 + d2 > 0: if d1 > 0 and d2 > 0: sl[5] = (str(d1) + 'S' + sl[5] + str(d2) + 'S') sl[3] = str(int(sl[3]) - d1) elif d1 > 0: sl[5] = (str(d1) + 'S' + sl[5]) sl[3] = str(int(sl[3]) - d1) else: sl[5] += (str(d2) + 'S') ### check, if mismatch_info is already available - replace in that case replaced = False if not options.append: for idx in range(10, len(sl)): if sl[idx][:2] == 'NM': sl[idx] = ('NM:i:' + str(min_d + deletions)) replaced = True if not replaced: sl.append('NM:i:' + str(min_d + deletions)) ### rebuild line _line = '' for block in sl: _line += (block + '\t') _line = _line[:-1] converted_aligns += 1 print >> outfile, _line outfile.close() fastq.close() print "\nconverted %s of %s alignments" % (converted_aligns, converted_aligns + skipped_aligns) print 'skipped %s of %s alignments' % (skipped_aligns, converted_aligns + skipped_aligns) if __name__ == '__main__': main()