Differences between revisions 1 and 2
Deletions are marked like this. | Additions are marked like this. |
Line 1: | Line 1: |
#acl ChrisSeidel:read,write,delete,revert All:read |
Solexa Eland Export format to WIG
Here is a python script which converts from Eland export output to a wig file:
#!/n/site/inst/Linux-i686/sys/bin/python # # import os, sys, random # if no args print usage message and quit if len(sys.argv) < 3: print 'usage: program chrlist elandfilename' sys.exit(0) # create output file name inputfile = sys.argv[2].rpartition('/')[2] if inputfile.endswith('.txt'): intuple = inputfile.rpartition('.') outfile = intuple[0] + ".wig" else: outfile = inputfile + ".wig" # get chromosome ids, UCSC names, sizes f=open(sys.argv[1], 'r') Chrs = [] chrnames = {} chrsize = {} for line in f: # skip comment lines if(line.startswith("#")): continue # remove newline, split by tab (chr,size,fa) = line.rstrip('\n').split('\t') Chrs.append(fa) chrnames[fa] = chr chrsize[fa] = size f.close() print "Chromosomes to search for:" for c in Chrs[:]: print c + " " print chrnames[c] # print len(Chrs) # create filenames # and open a bunch of files ChrFnames = [] Fps = {} i = 0 while i < len(Chrs): newFileName = Chrs[i] + "." + str(random.randint(1,1000)) + ".txt" Fps[Chrs[i]] = open(newFileName,'w') ChrFnames.append(newFileName) i = i + 1 # open input eland file f=open(sys.argv[2], 'r') # remember the begining position of file fbegin = f.tell() for c in Chrs[:]: f.seek(fbegin) # declare empty dictionary bases = {} print >>sys.stderr, "chromosome: " + c nlines = 0 for line in f: nlines += 1 L1 = line.split('\t') filtercode = L1[21].rstrip('\n') # skip reads that didn't pass filtering if filtercode == "N": continue # 10 is chr, 12 is base, 13 is strand if(L1[10] == c): # filter out U reads with more than 2 mm myset = ["A", "C", "G", "T", "N"] ctotal = 0 for s in myset: mycount = L1[14].count(s) ctotal += mycount if(ctotal > 2): # too many mm, skip to next sequence continue start = int(L1[12]) end = start + 36 if(end > int(chrsize[c])): end = int(chrsize[c]) for i in range(start, end): if(i in bases): bases[i] += 1 else: bases[i] = 1 # give some feedback if(not nlines % 10000): print >>sys.stderr, nlines # spit out results print >>Fps[c], "variableStep chrom=" + chrnames[c] # sort the dict and print it to file for mybase in sorted(bases.keys()): print >>Fps[c], "%s\t%d" % (mybase, bases[mybase]) # close file Fps[c].close() f.close() # create an output file to concatenate all the results # might want to test for presence of outfile, and if exists modify with random number # newFileName = "output" + str(random.randint(1,1000)) + ".wig" print >>sys.stderr, "creating output file: " + outfile fhout = open(outfile, 'w') # open each temp chr file # write results to output file # remove temp file for fname in ChrFnames[:]: f = open(fname,'r') for line in f: fhout.write(line) f.close() os.remove(fname) fhout.close()