Revision 1 as of 2009-07-31 22:35:11

Clear message

Solexa Eland Export format to WIG

Here is a python script which converts from Eland export output to a wig file:

#!/n/site/inst/Linux-i686/sys/bin/python
# 
#
import os, sys, random

# if no args print usage message and quit
if len(sys.argv) < 3:
    print 'usage: program chrlist elandfilename'
    sys.exit(0)


# create output file name
inputfile = sys.argv[2].rpartition('/')[2]

if inputfile.endswith('.txt'):
    intuple = inputfile.rpartition('.')
    outfile = intuple[0] + ".wig"
else:
    outfile = inputfile + ".wig"

# get chromosome ids, UCSC names, sizes
f=open(sys.argv[1], 'r')

Chrs = []
chrnames = {}
chrsize = {}
for line in f:
    # skip comment lines
    if(line.startswith("#")):
        continue
    # remove newline, split by tab
    (chr,size,fa) = line.rstrip('\n').split('\t')
    Chrs.append(fa)
    chrnames[fa] = chr
    chrsize[fa] = size
f.close()

print "Chromosomes to search for:"
for c in Chrs[:]:
    print c + " "
    print chrnames[c]

# print len(Chrs)

# create filenames
# and open a bunch of files
ChrFnames = []
Fps = {}
i = 0
while i < len(Chrs):
    newFileName = Chrs[i] + "." + str(random.randint(1,1000)) + ".txt"
    Fps[Chrs[i]] = open(newFileName,'w')
    ChrFnames.append(newFileName)
    i = i + 1

# open input eland file
f=open(sys.argv[2], 'r')

# remember the begining position of file
fbegin = f.tell()

for c in Chrs[:]:
    f.seek(fbegin)
    # declare empty dictionary
    bases = {}
    print >>sys.stderr, "chromosome: " + c
    nlines = 0
    for line in f:
        nlines += 1
        L1 = line.split('\t')
        filtercode = L1[21].rstrip('\n')
        # skip reads that didn't pass filtering
        if filtercode == "N":
            continue
        # 10 is chr, 12 is base, 13 is strand
        if(L1[10] == c):
            # filter out U reads with more than 2 mm
            myset = ["A", "C", "G", "T", "N"]
            ctotal = 0
            for s in myset:
                mycount = L1[14].count(s)
                ctotal += mycount
            if(ctotal > 2):
                # too many mm, skip to next sequence
                continue
            start = int(L1[12])
            end = start + 36
            if(end > int(chrsize[c])):
                end = int(chrsize[c])
            for i in range(start, end):
                if(i in bases):
                    bases[i] += 1
                else:
                    bases[i] = 1
        # give some feedback
        if(not nlines % 10000):
            print >>sys.stderr, nlines

    # spit out results
    print >>Fps[c], "variableStep chrom=" + chrnames[c]

    # sort the dict and print it to file
    for mybase in sorted(bases.keys()):
        print >>Fps[c], "%s\t%d" % (mybase, bases[mybase])
    # close file
    Fps[c].close()
f.close()

# create an output file to concatenate all the results
# might want to test for presence of outfile, and if exists modify with random number
# newFileName = "output" + str(random.randint(1,1000)) + ".wig"

print >>sys.stderr, "creating output file: " + outfile

fhout = open(outfile, 'w')

# open each temp chr file
# write results to output file
# remove temp file
for fname in ChrFnames[:]:
    f = open(fname,'r')
    for line in f:
        fhout.write(line)
    f.close()
    os.remove(fname)

fhout.close()