Project

General

Profile

Idea #13376 » generate_test_genome.py

Keldin Sergheyev, 04/26/2018 05:10 PM

 
1
#! /usr/bin/python3
2

    
3
# Purpose of this script is to take a vcf file as an input stream, and randomly
4
# unphase or delete positions in order to give something for phasing or
5
# imputation software to attempt to phase or impute. (Ultimately, that output
6
# gets graded for accuracy, thus benchmarking the software).
7
#
8
# While unphasing or deleting positions, a memo is kept of which positions
9
# are modified, so that you know which to grade later.
10
#
11
import os
12
import sys
13
import random
14

    
15
random.seed("Lily")
16

    
17
# Keep a record of the positions you've changed here
18
# Note that line number isn't worth tracking because beagle output will throw
19
# off the count.
20
#
21
fh = open("./changed_lines", "w")
22

    
23
linenumber = 0
24

    
25
# Strategy for getting GT from input vcf stream is to find occurence of final
26
# tab. The GT always appears after the final tab. So reverse string and find 
27
# first tab.
28
#
29
def get_gt_from_vcf_line(line):
30
    reversed = line[-1::-1]
31
    # Now we look for first tab
32
    first_tab = reversed.index("\t")
33
    last_tab = len(line) - first_tab
34
    gt = line[last_tab:last_tab+3]
35
    return gt, last_tab
36

    
37
for line in sys.stdin:
38
    linenumber += 1
39
    line = line.strip()
40
    if line[0] == "#":
41
        print(line)
42
    else:
43
        gt, pos = get_gt_from_vcf_line(line)
44
        # will trigger randomly about .5% of the time
45
        # ensures that positions to be test imputed are chosen randomly
46
        if random.randint(1,200) == 1:
47
            fh.write(str(linenumber) + "\t" + str(pos) + "\t" + str(gt) + "\n")
48
            gt = "./."
49

    
50
        print(line[:pos] + gt + line[pos+3:])
51

    
52
fh.close()
(1-1/2)