-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpqs_closest_to_gene.py
72 lines (49 loc) · 2.27 KB
/
pqs_closest_to_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 09 14:00:01 2021
@author: Ruth De Paula and Luciano Branco
"""
import pandas as pd
import numpy as np
import math
import numpy.matlib
import sys
# Input used when no path is given on the command line (tab-separated,
# no header row), and the column names assigned to it on load.
DEFAULT_INPUT = '500_first_splice_g4_counts_3_100.txt'
COLUMN_NAMES = ['strand', 'gene', 'ss_chr', 'ss_coord',
                'pqs_chr', 'pqs_coord', 'distance']


def closest_splice_site_per_pqs(table):
    """Keep, for every PQS, only the row whose splice site is closest.

    A PQS is identified by the pair ``(pqs_chr, pqs_coord)``; closeness is
    ``abs(ss_coord - pqs_coord)``.  Rows that share a PQS are reduced to the
    single row with the smallest value.  When several rows tie, the one that
    appears latest in ``table`` is kept, which is what the previous pairwise
    comparison did (it used a strict ``<`` and otherwise took the second row).

    Unlike the previous pairwise loop, this handles any number of rows per
    PQS, never carries rows over from one chromosome to the next, and does
    not rely on ``DataFrame.append`` (removed in pandas 2.0).

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns ``pqs_chr``, ``pqs_coord`` and ``ss_coord``.
        All other columns are passed through untouched.

    Returns
    -------
    pandas.DataFrame
        One row per distinct PQS, ordered by ``pqs_chr`` then ``pqs_coord``,
        with the same columns as ``table`` and a fresh 0..n-1 index.
    """
    rows = table.reset_index(drop=True)
    if rows.empty:
        return rows

    # Ranking columns live in a separate frame so the caller's columns are
    # never modified or shadowed by helper names.
    keys = rows[['pqs_chr', 'pqs_coord']].assign(
        gap=(rows['ss_coord'] - rows['pqs_coord']).abs(),
        row=range(len(rows)),
    )
    # Within each PQS: smallest gap first; among equal gaps the later input
    # row first.  An explicit ``row`` key makes the tie rule independent of
    # the sort algorithm's stability.  Missing gaps sort last, so they are
    # only kept when a PQS has no row with a usable gap.
    keys = keys.sort_values(
        ['pqs_chr', 'pqs_coord', 'gap', 'row'],
        ascending=[True, True, True, False],
    )
    winners = keys.drop_duplicates(['pqs_chr', 'pqs_coord'], keep='first')
    return rows.iloc[winners['row'].to_numpy()].reset_index(drop=True)


def main(argv=None):
    """Read the counts table, keep the closest splice site per PQS, save it.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name; defaults to
        ``sys.argv[1:]``.  ``argv[0]`` is the input file (default
        ``DEFAULT_INPUT``).  ``argv[1]`` is the output file (default: the
        input path with its extension replaced by ``_output.txt``, which for
        the default input is ``500_first_splice_g4_counts_3_100_output.txt``).
    """
    import os

    args = sys.argv[1:] if argv is None else list(argv)
    input_path = args[0] if args else DEFAULT_INPUT
    if len(args) > 1:
        output_path = args[1]
    else:
        output_path = os.path.splitext(input_path)[0] + '_output.txt'

    counts = pd.read_csv(input_path, delimiter='\t', names=COLUMN_NAMES)
    closest = closest_splice_site_per_pqs(counts)
    closest.to_csv(output_path, sep='\t', index=False)


if __name__ == '__main__':
    main()