-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_chunks.py
55 lines (45 loc) · 1.39 KB
/
merge_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from tqdm import tqdm
import numpy as np
import random
from collections import Counter
import time
from math import log10
from scipy import stats
from statistics import mean
from collections import defaultdict
import argparse
# Merge the per-chunk permutation-testing result files of one disease into a
# single tab-separated table, ordered by the numeric suffix of
# ``shuffled_set_id``.
#
# Usage: python merge_chunks.py <disease>
#
# Input : <disease>_permutation_testing_results_chunks/
#             <disease>_enrichment_after_permutations_<N>.bed
#         for N = 1000, 2000, ..., 100000 (tab-separated, with a header row).
# Output: final_permutation_testing_results/
#             <disease>_enrichment_after_permutations.bed
parser = argparse.ArgumentParser()
# A positional argument is always required, so the former ``default=None``
# could never take effect and has been dropped.
parser.add_argument("disease", help="ex: anthropometric")
args = parser.parse_args()
disease = args.disease

PATH = "final_permutation_testing_results"
# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
# if the directory appeared between the check and the creation.
os.makedirs(PATH, exist_ok=True)

# Chunk files are identified by the suffixes 1000, 2000, ..., LAST_CHUNK.
# Presumably each suffix is the last permutation id of the chunk -- confirm
# against the script that writes the chunks.
CHUNK_STEP = 1000
LAST_CHUNK = 100000

# Collect every chunk first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
chunks = []
for i in range(CHUNK_STEP, LAST_CHUNK + 1, CHUNK_STEP):
    temp = pd.read_csv(
        "%s_permutation_testing_results_chunks/%s_enrichment_after_permutations_%s.bed"
        % (disease, disease, i),
        sep="\t",
        header=0,
    )
    chunks.append(temp)
overall_results = pd.concat(chunks)

# NOTE(review): the original indentation was lost, so it is unclear whether
# this print ran once per chunk or once at the end; it now reports the total
# number of merged rows a single time.
print(overall_results.shape[0])

# The ids are strings whose last "_"-separated field is an integer; sort on
# that integer so the order is numeric rather than lexicographic.
overall_results["shuffled_set_id_int"] = (
    overall_results["shuffled_set_id"].str.split("_").str[-1].astype(int)
)
overall_results = overall_results.sort_values("shuffled_set_id_int")

# Keep only the published columns; this also drops the helper sort key.
overall_results = overall_results[["shuffled_set_id", "enrichment_ratio", "Disease"]]
overall_results.to_csv(
    "%s/%s_enrichment_after_permutations.bed" % (PATH, disease),
    sep="\t",
    header=True,
    index=False,
)