TargetRank

""" Implementation of TargetRank rules """ import sys, os, cPickle, smtplib, getopt # GLOBALS PATH = '/Users/cydneyn/localData/targetRank/' ORGs = {'Human': 'hg18', 'Mouse': 'mm8'} # random siRNA subset derived parameters SEED_WEIGHTS = {'t2t7': {'A': 0.04, 'T':0.04, 'C': 0.04, 'G':0.04, 'all': 0.04}, 't1At7': {'A': 0.11, 'T':0.11, 'C': 0.11, 'G':0.11,'all': 0.11}, 't2t8': {'A': 0.15, 'T':0.15, 'C': 0.08, 'G':0.08, 'all': 0.12}, 't1At8': {'A': 0.25, 'T':0.25, 'C': 0.17, 'G':0.17, 'all': 0.21}} # controlling only seedType and t9 (and upAU for upCon case) BIN_LFCs = {'upCon': {'1':0.085, '2': 0.135, '3': 0.163}, 'downAU': {'1':0.083, '2': 0.126, '3': 0.182}} nLCF_MEANS = {'upCon': 0.128, 'downAU': 0.130} BIN_CUTOFFS = {'upCon': {'1': (0.0, 0.32), '2': (0.33,0.56), '3': (0.57,1,00)}, 'downAU': {'1': (0.0, 0.52), '2': (0.53,0.66), '3': (0.67,1,00)}} MIN_SCORE = 0.2 HTML_HEADER = """ \ TargetRank """ s += HTML_FOOTER return s def targetRank(seqs, org): targetsBySM, allSeedMatches = {}, {} for seq in seqs: # clear off any whitespace seq = seq.strip().replace("\'",'').replace('\"', '') # error if unexpected letters for nt in seq: if nt.upper() not in ['A','C', 'G','T','U']: error = HTML_HEADER error += """

""" def reverseComplement(seq): seq = seq.upper() seq = seq.replace('U', 'T') rcSeq = [] for nt in seq: if nt == 'A': rcSeq.append('T') elif nt == 'T': rcSeq.append('A') elif nt == 'C': rcSeq.append('G') elif nt == 'G': rcSeq.append('C') rcSeq.reverse() rcSeq = ''.join(rcSeq) if len(rcSeq) != len(seq): print "Incomplete reverse complement conversion" sys.exit() return rcSeq def possibleSeedMatches(smRegion): seedMatches = {} # make possible 6mers seedMatch = smRegion[1:7] for t9 in ['A','C','G','T']: for t8 in ['A','C','G','T']: if t8 == smRegion[0]: continue for t1 in ['C','G','T']: seedMatches[t9 + t8 + seedMatch + t1] = 't2t7' # make possible A1 7mers for t9 in ['A','C','G','T']: for t8 in ['A','C','G','T']: if t8 == smRegion[0]: continue seedMatches[t9 + t8 + seedMatch + 'A'] = 't1At7' # make possible M8 7mers for t9 in ['A','C','G','T']: for t1 in ['C','G','T']: seedMatches[t9 + smRegion[0] + seedMatch + t1] = 't2t8' # make possible 8mers for t9 in ['A','C','G','T']: seedMatches[t9 + smRegion[0] + seedMatch + 'A'] = 't1At8' # for sm in seedMatches: # print sm, seedMatches[sm] # print len(seedMatches), "possible seed matches generated" return seedMatches def parseSiInput(): opts,args = getopt.getopt(sys.argv[1:], "", ["org=", "si=", "miRname="]) if not opts: print "Need to provide command-line options:" print "--org and --si or --miRname" sys.exit() inputOrg, si, miRname = None, None, None for o,a in opts: if o in ("--org"): inputOrg = a if o in ("--si"): si = a if o in ("--miRname"): miRname = a # handle an empty form - org will default to 'Human' if not si and not miRname: print "No input provided" sys.exit() # handle dual input case if si and miRname: print "Both a miRNA/siRNA sequence and a miRBase miRNA name were submitted. Please input just one or the other." sys.exit() # handle miRNA name input if not si and miRname: inputOrg, siSeqs, error = convertNameToSeqAndOrg(miRname) if error: print error sys.exit() elif si and not miRname: siSeqs = si.split(',') org = ORGs[inputOrg] return siSeqs, org def retrieveTargets(path, org, word): fn = path + org + '/wordIndices/' + word[0] + '/' + word + '_index.pickle' if not os.path.exists(fn): return None fh = open(fn, 'rb') targets = cPickle.load(fh) fh.close() # print len(targets), "targets retrieved" return targets def retrieveGeneIDs(path, org): fn = path + org + '/geneNames.pickle' fh = open(fn, 'rb') geneIDs = cPickle.load(fh) fh.close() return geneIDs def scoreTarget(sm, info): complete = True # sm[0] is the t9 position seedType = info['seedType'] score = SEED_WEIGHTS[seedType][sm[0]] if seedType in ['t1At7', 't2t8', 't1At8']: # upstream conservation upCon = info['upCon'] if upCon == 'NA': upConScore = 0.0 complete = False else: if upCon <= BIN_CUTOFFS['upCon']['1'][-1]: bin = '1' elif upCon >= BIN_CUTOFFS['upCon']['3'][0]: bin = '3' else: bin = '2' upConScore = BIN_LFCs['upCon'][bin] \ - nLCF_MEANS['upCon'] # downstream AU content downAU = info['downAU'] if downAU == 'NA': downAUScore = 0.0 complete = False else: if downAU <= BIN_CUTOFFS['downAU']['1'][-1]: bin = '1' elif downAU >= BIN_CUTOFFS['downAU']['3'][0]: bin = '3' else: bin = '2' downAUScore = BIN_LFCs['downAU'][bin] \ - nLCF_MEANS['downAU'] score += upConScore + downAUScore return score, complete def scoreAllTargets(seedMatches, targetsBySM): scoredTargets = {} # determine if all seed matches have a single seed match type mixed = False for sm in seedMatches: if len(seedMatches[sm]) > 1: mixed = True break for sm in targetsBySM: # print '\n', sm, seedMatches[sm] for id in targetsBySM[sm]: # print id scoredTargets.setdefault(id, {'name': None, 'product': None, 'score': 0.0, 'complete': True, 'total': 0, 'mixed': mixed}) # if some mixed type, set all type counts to 'NA' for type in ['con_t2t7', 'con_t1At7', 'con_t2t8', 'con_t1At8', 'nc_t2t7', 'nc_t1At7', 'nc_t2t8', 'nc_t1At8']: if mixed: scoredTargets[id][type] = 'NA' else: scoredTargets[id].setdefault(type, 0) # score each match smScore = 0.0 for match in targetsBySM[sm][id]: # print match score,complete = scoreTarget(sm, match) # print 'single match score:', score smScore += score if not complete: scoredTargets[id]['complete'] = False scoredTargets[id]['name'] = match['name'] scoredTargets[id]['product'] = match['product'] # only record conservation type if single seed match type if not mixed: if seedMatches[sm][0] == 't2t7': if match['con_m2m7'] == 1: scoredTargets[id]['con_t2t7'] += 1 else: scoredTargets[id]['nc_t2t7'] += 1 elif seedMatches[sm][0] == 't1At7': if match['con_m1m7'] == 1: scoredTargets[id]['con_t1At7'] += 1 else: scoredTargets[id]['nc_t1At7'] += 1 elif seedMatches[sm][0] == 't2t8': if match['con_m2m8'] == 1: scoredTargets[id]['con_t2t8'] += 1 else: scoredTargets[id]['nc_t2t8'] += 1 elif seedMatches[sm][0] == 't1At8': if match['con_m1m8'] == 1: scoredTargets[id]['con_t1At8'] += 1 else: scoredTargets[id]['nc_t1At8'] += 1 # correct for multiple seed match types smScore = smScore/len(seedMatches[sm]) # print "seed match score:", smScore scoredTargets[id]['score'] += smScore # print scoredTargets[id]['score'] scoredTargets[id]['total'] += len(targetsBySM[sm][id])/len(seedMatches[sm]) return scoredTargets def outputTargetsAsText(siSeqs, inputOrg): # org = ORGs[inputOrg] # siSeqs = siSeqs.split(',') targets, error = targetRank(siSeqs, inputOrg) if error: return error targetIDs = targets.keys() targetIDs.sort(lambda x,y: cmp(targets[x]['score'], targets[y]['score'])) targetIDs.reverse() seedTypes = ['con_t1At8', 'con_t2t8', 'con_t1At7', 'con_t2t7', 'nc_t1At8', 'nc_t2t8', 'nc_t1At7', 'nc_t2t7'] s = 'Rank, Gene Name, Gene/Isoform Description, Refseq ID, TargetRank Score, ' s += 'Conserved Seed Matches: 8mer, M8 7mer, A1 7mer, 6mer, ' s += 'Non-conserved Seed Matches: 8mer, M8 7mer, A1 7mer, 6mer\n' sameRankCount = 1 rank = 1 prevScore = targets[targetIDs[0]]['score'] for i,id in enumerate(targetIDs): score = targets[id]['score'] # filter for minimum score if score < MIN_SCORE: continue if score != prevScore: rank += sameRankCount sameRankCount = 1 prevScore = targets[id]['score'] elif i != 0: sameRankCount += 1 firstID = id.split('|')[0] s += "%s, " %rank s += "%s, " %targets[id]['name'] s += "%s, " %targets[id]['product'].replace(',', ' -') s += "%s, " %firstID s += "%.4f, " %targets[id]['score'] s += "%s, " %targets[id]['total'] s += "%s, " %targets[id]['con_t1At8'] s += "%s, " %targets[id]['con_t2t8'] s += "%s, " %targets[id]['con_t1At7'] s += "%s, " %targets[id]['con_t2t7'] s += "%s, " %targets[id]['nc_t1At8'] s += "%s, " %targets[id]['nc_t2t8'] s += "%s, " %targets[id]['nc_t1At7'] s += "%s\n" %targets[id]['nc_t2t7'] return s def convertNameToSeqAndOrg(miRnames): inputOrg, seqs, error = None, [], None miRnames = miRnames.split(',') for miRname in miRnames: miRname = miRname.lower().strip().replace("\'", '').replace('\"', '') # deduce organism if miRname.startswith('hsa'): org = 'hg18' if inputOrg and inputOrg != 'Human': error = HTML_HEADER error += """""" error += """""" error += """

Input miRNA names must all be from the same organism (i.e. all start with hsa- or mmu-).

""" error += HTML_FOOTER return None, None, error inputOrg = 'Human' elif miRname.startswith('mmu'): org = 'mm8' if inputOrg and inputOrg != 'Mouse': error = HTML_HEADER error += """""" error += """""" error += """

Input miRNA names must all be from the same organism (i.e. all start with hsa- or mmu-).

""" error += HTML_FOOTER return None, None, error inputOrg = 'Mouse' else: # error if not a human or mouse miRname error = HTML_HEADER error += """""" error += """""" %miRname error += """

Input miRNA name, %s, must be either from human (start with 'hsa-') or mouse (start with 'mmu-').

""" error += HTML_FOOTER return None, None, error # open the appropriate miRBase file fn = PATH + org + '/miRBase_10.pickle' fh = open(fn, 'rb') miRseeds = cPickle.load(fh) fh.close() if not miRseeds.has_key(miRname): # error if unknown miRname error = HTML_HEADER error += """

Input miRNA name, %s, not found in human or mouse miRBase (release 10.0).

It is possible that a more specific name needs to be specified (e.g. hsa-mir-133a or hsa-mir-133b, instead of hsa-mir-133).

The genomic loci number is not required (e.g. hsa-mir-133a will return the same results as hsa-mir-133a-1).

""" %miRname error += HTML_FOOTER return None, None, error seqs.append(miRseeds[miRname]) return inputOrg, seqs, error def outputTargetsAsHTML(si=None, inputOrg=None, miRname=None): # handle an empty form - org will default to 'Human' if not si and not miRname: error = HTML_HEADER error += """""" error += """""" error += """

No input provided.

""" error += HTML_FOOTER return error # handle dual input case if si and miRname: error = HTML_HEADER error += """""" error += """""" error += """

Both a miRNA/siRNA sequence and a miRBase miRNA name were submitted. Please input just one or the other.

""" error += HTML_FOOTER return error # handle miRNA name input if not si and miRname: inputOrg, siSeqs, error = convertNameToSeqAndOrg(miRname) if error: return error elif si and not miRname: siSeqs = si.split(',') displayCount = 100 org = ORGs[inputOrg] if org == 'hg18': orgName = 'Homo+sapiens' elif org == 'mm8': orgName = 'Mus+musculus' targets, error = targetRank(siSeqs, org) if error: return error targetIDs = targets.keys() targetIDs.sort(lambda x,y: cmp(targets[x]['score'], targets[y]['score'])) targetIDs.reverse() s = HTML_HEADER # filter targets for a min score filteredTargetIDs = [] for id in targetIDs: if targets[id]['score'] < MIN_SCORE: continue filteredTargetIDs.append(id) # add a table for input description s += """""" if miRname: s += """""" %(inputOrg, miRname) else: s += """""" %(inputOrg, ','.join(siSeqs)) if len(filteredTargetIDs) < displayCount: s += """ \ """ %len(filteredTargetIDs) else: s += """ \ """ %(len(filteredTargetIDs), displayCount) # add a 'download all' button s += """ \

%s input miRNA name(s): %s
%s input sequence(s) (5' - 3' orientation): %s
%s targets found
%s targets found (top %s displayed)

""" %(','.join(siSeqs), inputOrg) # add a table for targets s += """ \ """ seedTypes = ['con_t1At8', 'con_t2t8', 'con_t1At7', 'con_t2t7', 'nc_t1At8', 'nc_t2t8', 'nc_t1At7', 'nc_t2t7'] sameRankCount = 1 rank = 1 prevScore = targets[filteredTargetIDs[0]]['score'] for i,id in enumerate(filteredTargetIDs[:100]): score = targets[id]['score'] if score != prevScore: rank += sameRankCount sameRankCount = 1 prevScore = targets[id]['score'] elif i != 0: sameRankCount += 1 firstID = id.split('|')[0] s += "" s += """""" %rank s += """""" %(targets[id]['name'],orgName, targets[id]['name']) s += "" %targets[id]['product'] s += "" %firstID s += """""" %targets[id]['score'] s += """""" %targets[id]['total'] s += """""" %targets[id]['con_t1At8'] s += """""" %targets[id]['con_t2t8'] s += """""" %targets[id]['con_t1At7'] s += """""" %targets[id]['con_t2t7'] s += """""" %targets[id]['nc_t1At8'] s += """""" %targets[id]['nc_t2t8'] s += """""" %targets[id]['nc_t1At7'] s += """""" %targets[id]['nc_t2t7'] s += "" s += """\

Rank

Gene Name

Gene/Isoform Description

Refseq ID

TargetRank Score

Seed Match Total

Conserved Seed Matches

Non-conserved Seed Matches

8mer

M8 7mer

A1 7mer

6mer

8mer

M8 7mer

A1 7mer

6mer

%.4f

""" error += """""" %nt error += """

Unknown letter: %s

Only 'A', 'C', 'G', 'T', and 'U' accepted.

""" error += HTML_FOOTER return None, error seq = seq.upper() # error if input sequence too short if len(seq) < 8: error = HTML_HEADER error += """""" error += """""" error += """

Input sequence too short (minimum of 8 letters).

""" error += HTML_FOOTER return None, error sRegion = seq[:8] smRegion = reverseComplement(sRegion) # print '\n', seq seedMatches = possibleSeedMatches(smRegion) for sm in seedMatches: allSeedMatches.setdefault(sm, []).append(seedMatches[sm]) targets = retrieveTargets(PATH, org, sm) if not targets: continue targetsBySM.setdefault(sm,{}) for id in targets: # add on seed match type info for i,match in enumerate(targets[id]): targets[id][i]['seedType'] = seedMatches[sm] targetsBySM[sm].setdefault(id, []).extend(targets[id]) # error if not seed matches found if not targetsBySM: error = HTML_HEADER error += """""" error += """No targets found""" error += """

""" error += HTML_FOOTER return None, error scoredTargets = scoreAllTargets(allSeedMatches, targetsBySM) return scoredTargets, None def run(): siSeqs, org = parseSiInput() s = outputTargetsAsText(siSeqs, org) print s run()