|
#! /usr/bin/python3
|
|
|
|
# Purpose of this script is to take a vcf file as an input stream, and randomly
|
|
# unphase or delete positions in order to give something for phasing or
|
|
# imputation software to attempt to phase or impute. (Ultimately, that output
|
|
# gets graded for accuracy, thus benchmarking the software).
|
|
#
|
|
# While unphasing or deleting positions, a memo is kept of which positions
|
|
# are modified, so that you know which to grade later.
|
|
#
|
|
import os
|
|
import sys
|
|
import random
|
|
|
|
random.seed("Lily")
|
|
|
|
# Keep a record of the positions you've changed here
|
|
# Note that line number isn't worth tracking because beagle output will throw
|
|
# off the count.
|
|
#
|
|
fh = open("./changed_lines", "w")
|
|
|
|
linenumber = 0
|
|
|
|
# Strategy for getting GT from input vcf stream is to find occurence of final
|
|
# tab. The GT always appears after the final tab. So reverse string and find
|
|
# first tab.
|
|
#
|
|
def get_gt_from_vcf_line(line):
|
|
reversed = line[-1::-1]
|
|
# Now we look for first tab
|
|
first_tab = reversed.index("\t")
|
|
last_tab = len(line) - first_tab
|
|
gt = line[last_tab:last_tab+3]
|
|
return gt, last_tab
|
|
|
|
for line in sys.stdin:
|
|
linenumber += 1
|
|
line = line.strip()
|
|
if line[0] == "#":
|
|
print(line)
|
|
else:
|
|
gt, pos = get_gt_from_vcf_line(line)
|
|
# will trigger randomly about .5% of the time
|
|
# ensures that positions to be test imputed are chosen randomly
|
|
if random.randint(1,200) == 1:
|
|
fh.write(str(linenumber) + "\t" + str(pos) + "\t" + str(gt) + "\n")
|
|
gt = "./."
|
|
|
|
print(line[:pos] + gt + line[pos+3:])
|
|
|
|
fh.close()
|