Project

General

Profile

Idea #13376 » generate_test_genome.py

Keldin Sergheyev, 04/26/2018 05:10 PM

 
#! /usr/bin/python3

# Purpose of this script is to take a vcf file as an input stream, and randomly
# unphase or delete positions in order to give something for phasing or
# imputation software to attempt to phase or impute. (Ultimately, that output
# gets graded for accuracy, thus benchmarking the software).
#
# While unphasing or deleting positions, a memo is kept of which positions
# are modified, so that you know which to grade later.
#
import os
import sys
import random

random.seed("Lily")

# Keep a record of the positions you've changed here
# Note that line number isn't worth tracking because beagle output will throw
# off the count.
#
fh = open("./changed_lines", "w")

linenumber = 0

# Strategy for getting GT from input vcf stream is to find occurence of final
# tab. The GT always appears after the final tab. So reverse string and find
# first tab.
#
def get_gt_from_vcf_line(line):
reversed = line[-1::-1]
# Now we look for first tab
first_tab = reversed.index("\t")
last_tab = len(line) - first_tab
gt = line[last_tab:last_tab+3]
return gt, last_tab

for line in sys.stdin:
linenumber += 1
line = line.strip()
if line[0] == "#":
print(line)
else:
gt, pos = get_gt_from_vcf_line(line)
# will trigger randomly about .5% of the time
# ensures that positions to be test imputed are chosen randomly
if random.randint(1,200) == 1:
fh.write(str(linenumber) + "\t" + str(pos) + "\t" + str(gt) + "\n")
gt = "./."

print(line[:pos] + gt + line[pos+3:])

fh.close()
(1-1/2)