PYTHON Design Script v0.1

import subprocess
import os
import itertools
import math
from colorama import *
init(autoreset=True)

def ParseNUPACK_Output(inp):
    assert os.path.isfile(inp)
    allseqs = []
    for line in open(inp).readlines():
        if not line.startswith('        sequence'): continue
        l = line.split(':')[1].strip().split('+')
        allseqs.extend( l )
    defects = []
    for line in open(inp).readlines():
	if not line.startswith('        defect[nt]'): continue
        l = line.split(':')[1].strip().split('+')
        defects.extend( l )
    normdefects = []
    for line in open(inp).readlines():
        if not line.startswith('        normalized defect'): continue
        l = line.split(':')[1].strip().split('+')
        normdefects.extend( l )
    probability = []
    for line in open(inp).readlines():
        if not line.startswith('        target probability'): continue
        l = line.split(':')[1].strip().split('+')
        probability.extend( l )
    target = []
    for line in open(name + '_0.npo').readlines():
        if not line.startswith('        structure'): continue
        l = line.split(':')[1].strip().split(' ')
        target.extend( l )
    return allseqs, defects, normdefects, probability, target

def print_target_energy(seqs, inp, Egap=1.0):
    assert os.path.isfile(name + '_0.npo')
    target = []
    for line in open(name + '_0.npo').readlines():
        if not line.startswith('        structure'): continue
        l = line.split(':')[1].strip().split(' ')
        target.append( l )
    test = target[0][0]
    open(name + '.in','w').write(NUPACK_script_subopt_prob(seqs, test, Egap=Egap))
    DEVNULL = open(os.devnull, 'wb')
    cmd = 'energy -multi -material dna %s > energy.txt' % name
    proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
    proc.communicate()
    assert os.path.isfile('energy.txt')
    fp = open("energy.txt")
    lines = fp.readlines()
    pre_energy = lines[18]
    energy = float(pre_energy)
    
    return test, energy

def ParseNUPACK_SubOptOutput(seqs, inp, Egap):
    test, energy = print_target_energy(seqs, inp, Egap=Egap)
    print 'First, the target structure and its energy:'
    print '\n%6.3f %s' % (energy, test)
    print ' '
    print 'Now all other secondary structures within specified Egap of the MFE.'
    print 'Note that the first structure shown is the lowest energy (mfe) structure.'
    ss2E = {}
    marker = '% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %'
    instr = open(inp + '.subopt').read()
    problist = []
    for chunk in instr.split(marker)[1:-1]:
        chunklines = chunk.split('\n')
        if len(chunklines) probs.txt' % name
	proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
	proc.communicate()
	assert os.path.isfile('probs.txt')
	fp = open("probs.txt")
	lines = fp.readlines()
	pre_prob = lines[14]
	prob = float(pre_prob)
	assert os.path.isfile(name + '.subopt')
	print '\n%6.3f %s %0.6f' % (E, ''.join(test_strand_true), prob)
        problist.append(prob)

    return ss2E

def Parse_mfe(inp):
    assert os.path.isfile(inp + '.mfe')
    marker = '% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %'
    instr = open(inp + '.mfe').read()
    ss2E = {}
    for chunk in instr.split(marker)[1:-1]:
        chunklines = chunk.split('\n')
        if len(chunklines)<4: continue
        E = float(chunklines[2])
        ss = chunklines[3]
        ss2E[ss] = E
        test_strand = ss
    print 'Secondly, the mfe and its structure:'
    print '\n%6.3f %s' % (E, test_strand)


def Parse_complexes_mfe(inp):
    assert os.path.isfile(inp + '.ocx-mfe')
    marker = '% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %'
    instr = open(inp + '.ocx-mfe').read()
    ss2E = {}
    for chunk in instr.split(marker)[1:-1]:
        chunklines = chunk.split('\n')
        if len(chunklines)<4: continue
        E = float(chunklines[3])
        ss = chunklines[4]
        ss2E[ss] = E
        test_strand = ss
        print '\n%6.3f %s ' % (E, test_strand)

def Parse_complexes_mfe_energy(seqs, inp, Egap=1.0):
    assert os.path.isfile(inp + '.ocx-mfe')
    test, energy = print_target_energy(seqs, inp, Egap=1.0)
    marker = '% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %'
    instr = open(inp + '.ocx-mfe').read()
    ss2E = {}
    for chunk in instr.split(marker)[1:-1]:
        chunklines = chunk.split('\n')
        if len(chunklines)<4: continue
        E = float(chunklines[3])
        id = chunklines[1]
        ss = chunklines[4]
        ss2E[ss] = E
        test_strand = ss
	if E  0 and domains[i][0] != domains[i-1][0]:  ## New Strand
            strands.append(strand)
            strand = ''
        strand += sym * num
    strands.append(strand)
    return '+'.join(strands)

def Design2nny(segs, T, no_repeats=True, name='temp'):
    open(name + '.np','w').write(NUPACK_script_2nny_design(segs, T, no_repeats=no_repeats))
    DEVNULL = open(os.devnull, 'wb')
    cmd = 'multitubedesign %s' % name
    proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
    proc.communicate()
    assert os.path.isfile(name + '_0.npo')
    allseqs, defs, ndef, prb, target = ParseNUPACK_Output(name + '_0.npo')
    targ = target[0]
    return allseqs, defs, ndef, prb, targ

def read_seq_spec(seq):
    if set(seq).issubset('ATCGRYSWKMBDHVN'):
        tmp = seq.replace(' ','')
        return tmp, len(tmp)
    import re
    str_parts = re.split('\d+', seq)
    num_parts = map(int,re.findall('\d+', seq))
    print str_parts
    print num_parts
    newseq = ''
    for i, str_part in enumerate(str_parts[:-1]):
        print str_part
        newseq += str_part
        if num_parts[i] 0:
		comp.extend( [i+1] * strand_pop[i])
	    complexes.append(comp)
    out = open(outfile,'w')
    for comp in complexes:
	print >>out, ' '.join(map(str,comp))
    """
    os.system('python gen_all_on_pathway_complexes.py')
    assert os.path.isfile(name + '.list')
    order = math.ceil(len(seqs)/2)
    open(name + '.in','w').write(NUPACK_script_complexes(seqs, order=order))
    DEVNULL = open(os.devnull, 'wb')
    assert os.path.isfile(name + '.in')
    cmd = 'complexes.exe -mfe %s' % name
    proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
    proc.communicate()
    assert os.path.isfile(name + '.ocx-mfe')
    Parse_complexes_mfe_energy(seqs, name, Egap=Egap)

def NUPACK_script_basic2(seqs):
    """Input format for pfunc, pairs, mfe"""
    uniq_seqs = set(seqs)
    outstr = '%d\n' % len(uniq_seqs)
    for seq in seqs:
        outstr += seq + '\n'
    #outstr += ' '.join(['1']*len(seqs))
    outstr += ' '.join(map(str, range(1, len(seqs)+1)))
    return outstr

def NUPACK_script_subopt(seqs, Egap=1.0):
    """Input format for subopt"""
    uniq_seqs = set(seqs)
    outstr = '%d\n' % len(uniq_seqs)
    for seq in seqs:
        outstr += seq + '\n'
    outstr += ' '.join(map(str, range(1, len(seqs)+1)))
    outstr += '\n%.1f\n' % Egap
    return outstr

def NUPACK_script_subopt_prob(seqs, struc, Egap=1.0):
    """Input format for subopt with secondary structure"""
    uniq_seqs = set(seqs)
    outstr = '%d\n' % len(uniq_seqs)
    for seq in seqs:
	outstr += seq + '\n'
    outstr += ' '.join(map(str, range(1, len(seqs)+1))) + '\n'
    outstr += struc
    outstr += '\n%.1f\n' % Egap
    return outstr

def RunMFE(seqs, T, name='temp_complexes', order=2):
    open(name + '.in','w').write(NUPACK_script_basic2(seqs))
    DEVNULL = open(os.devnull, 'wb')
    cmd = 'mfe -T %.1f -multi -material dna %s' % (T, name)
    proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
    proc.communicate()
    assert os.path.isfile(name + '.mfe')

def RunSubOpt(seqs, T, name='temp_complexes', Egap=1.0):
    print 'Running subopt with T = %.1f and energy gap (Egap) = %.1f' % (T, Egap)
    print ' '
    open(name + '.in','w').write(NUPACK_script_subopt(seqs, Egap=Egap))
    DEVNULL = open(os.devnull, 'wb')
    cmd = 'subopt -T %.1f -multi -material dna %s' % (T, name)
    proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=DEVNULL)
    proc.communicate()
    assert os.path.isfile(name + '.subopt')
    ParseNUPACK_SubOptOutput(seqs, name, Egap)

def MultiTubeDefect(segs, seqs, T, name='temp'):
    noknot = ''.join(zip(*segs)[0]).replace('*','.')
    domains = list(zip(*segs)[1])
    lengths = [read_seq_spec(x)[1] for x in zip(*segs)[2]]
    outstr = 'material = dna\n'
    outstr += 'temperature = %.1f\n' % T
    outstr += SpecifiedDomainStr(segs, seqs)
    domains = list(zip(*segs)[1])
    outstr += GetStrandDomainStr(domains)
    raw_strands = [x[0].upper() for x in zip(*segs)[1]]
    seen = set()
    strands = [x for x in raw_strands if x not in seen and not seen.add(x)]
    outstr += 'complex %s = %s\n' % (name, ' '.join(strands))
    outstr += name + '.structure = ' + TileDotParen(domains, lengths, noknot) + '\n\n'
    outstr += 'tube maintube = '+name + '\n'
    outstr += 'maintube.%s.conc = 1.0e-6\n' % name
    outstr += 'maintube.offtargets = {maxsize = 3}\n'
    return outstr

def parse_multitubedefect_out(outfile):
    off = False
    condition1, condition2 = False, False
    for line in open(outfile).readlines():
        if off: continue
        if line.startswith('    temperature[K]:'): T = float(line.split()[-1])
        if line.startswith('physical results:'): condition1 = True
        if not condition1: continue
        l = line.split()
        if l[0] == 'tubes:': condition2 = True
        if not condition2: continue
        if l[0] == 'concentration:':  y = float(l[1])
        if l[0] == 'target' and l[1] == 'concentration:':  tot = float(l[-1])
        if line.startswith('mean objective:'): off = True
    return T, y/tot

def intersperse(lst, item):
    result = [item] * (len(lst) * 2 - 1)
    result[0::2] = lst
    return result

def target_color(seqs, targ):
    list1 = targ
    list2_pre = intersperse(seqs, '+')
    list2 = ''.join(list2_pre)
    targ_color = []
    for let1, let2 in zip(list1, list2):
	if let1 == '+':
	    let1 = Fore.WHITE+let1
	elif let2 == 'T':
	    let1 = Fore.RED+let1
	elif let2 == 'G':
	    let1 = Fore.YELLOW+let1
	elif let2 == 'C':
	    let1 = Fore.BLUE+let1
	elif let2 == 'A':
	    let1 = Fore.GREEN+let1
	targ_color.append(let1)
    print 'Below is the target structure color-coded by base:'
    print 'BLUE is Cystosine (C)'
    print 'YELLOW is Guanine (G)'
    print 'RED is Thymine (T)'
    print 'GREEN is Adenine (A)'
    print ' '
    print ''.join(targ_color)
    return targ_color

#+------------------------------------+
#|         Begin Editing Here         |
#+------------------------------------+

if __name__ == '__main__':
    T = 37.0                            #The temperature to design and analyze at (degC)
    name = 'tiledesign11'             #This is the name for the generated NUPACK files

    ##"segs" is a tuple where domains are defined. A baseline example is provided below. So long as the general format is followed, domains can be added/removed as needed.
    ##The first column indicates whether the associated bases are opening "(", closing ")", or unpaired ".". This should have been determined when the NUPACK Design Guidelines.
    ##The second columns is for the name of the domain. Typically, a simple letter-number scheme is used. As before, this was established through following the Guidelines.
    ##The third column is where you list the bases in that domain. Make sure your count is accurate as discrepencies will prevent the rest of the script from functioning.
    ##Note that bases must be listed from the 5' to the 3' end!

    segs = [['.','a0','T'],
            ['(','a1','TNNNNNNCAGGAAGCACTTCCT'],
            ['(','a2','GSSSS'],
            ['(','a3','NTAATTNNNNNN'],
            ['*','b0','SSSSS'],
            ['(','b1','CAGGAAGCACTTCCTGNNNNNTAA'],
            ['.','c0','A'],
            [')','c1','TTANNNNNCAGGAAGTGCTTCCTG'],
            [')','c2','NNNNNNAATTAN'],
            [')','f0','SSSSC'],
            ['(','f1','AGGAAGTGCTTCCTGNNNNNNA'],
            ['.','g0','T'],
            [')','g1','TNNNNNNCAGGAAGCACTTCCT'],
            ['(','g2','GNNNNNTAATTN'],
            ['(','h0','CAGGAAGCACTTCCTGNNNNNTAA'],
            ['.','e0','A'],
            [')','e1','TTANNNNNCAGGAAGTGCTTCCTG'],
            ['*','e2','SSSSS'],
            [')','e3','NAATTANNNNNC'],
            [')','d0','AGGAAGTGCTTCCTGNNNNNNA']]

    strands = ['Strand A',           #Here, list the strands by letter in the order they were entered into "segs."
               'Strand B',
               'Strand C',
               'Strand F',
               'Strand G',
               'Strand H',
               'Strand E',
               'Strand D']

    seqs, defs, ndefs, probs, targ = Design2nny(segs, T, name=name)
    print 'Finished design! HUZZAH!!!'
    print ' '
    print 'Tile Design/Name: %s' % name
    print ' '
    print 'Target Stucture: %s' % targ
    print 'Normalized Defect: %3.6f' % float(ndefs[0])
    print 'Target Probability: %3.6f' % float(probs[0])
    print ' '
    for seq, strand in zip(seqs, strands):
        list = read_seq_spec(seq)
        print '%s: %s with %3.0f bases.' % (strand, list[0], list[1])
    print ' '
    target_color(seqs, targ)
    print ' '

#+------------------------------------+
#|   Complex .in File and Evaluation  |
#+------------------------------------+

    #The below are for if you want to get a .in file that lists the strands.
    #The order must be assigned and equates to the maximum number of strands allowed to be in a complex.

    name2 = 'design_cmplx'
    #open(name2 + '.in','w').write(NUPACK_script_complexes(seqs, order = 2))

    #The below are for evaluating the complexes accoridng to NUPACK's complexes command.
    #This calculates the partition function and equilibrium base pairing for each possible complex in a test tube.
    #For a complete analysis of possible tile structures, said order = number of unique strands.
    #Otherwise, a limited selection can be used by setting order =24 hours.
    
    name3 = 'design_cmplx_part'
    #EvaluateComplexes_full(seqs, name=name2, order=2, Egap = 1.0)
    #EvaluateComplexes_partial(seqs, name=name2, Egap=1.0)
    
#+------------------------------------+
#|       Minimum Free Energy          |
#+------------------------------------+
    
    #Calculates the secondary structure that has the minimum free energy possible for the designed system.
    #Output file is a .mfe with the same name as the one specified in the design script.

    RunMFE(seqs, T, name=name)

#+------------------------------------+
#|       SubOpt Functionality         |
#+------------------------------------+

    RunSubOpt(seqs, T, name=name, Egap=1.0)


Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s