-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path450_calculate_df_gcgb_sample.py
executable file
·30 lines (24 loc) · 1.12 KB
/
450_calculate_df_gcgb_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas as pd
import argparse
from tqdm import tqdm
# Columns that together identify one output row (one group).
GROUP_COLUMNS = ['Contig_Classification', 'Country', 'Location', 'Sample']
# Numeric columns that are summed within each group.
SUM_COLUMNS = ['Defense_Number', 'Contig_Length']
# Column order of the exported CSV.
OUTPUT_COLUMNS = GROUP_COLUMNS + SUM_COLUMNS + ['GCGB']


def aggregate_by_sample(df):
    """Sum ``Defense_Number`` and ``Contig_Length`` per group.

    Rows are grouped by ``GROUP_COLUMNS``; the group keys are returned as
    ordinary columns (not as an index).

    Raises:
        KeyError: if any grouping or summed column is absent from *df*.
    """
    # Fail early with a readable message instead of a KeyError raised from
    # deep inside pandas' groupby/agg machinery.
    missing = [col for col in GROUP_COLUMNS + SUM_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {', '.join(missing)}")

    # NOTE(review): groupby keeps its default handling of missing group keys;
    # confirm that rows with an empty Country/Location/Sample should be
    # excluded from the output.
    return (
        df.groupby(GROUP_COLUMNS)
        .agg({col: 'sum' for col in SUM_COLUMNS})
        .reset_index()
    )


def add_gcgb(grouped):
    """Return a copy of *grouped* with the ``GCGB`` column appended.

    ``GCGB`` is ``Defense_Number * 1000000 / Contig_Length`` computed row-wise.
    No special handling is applied when ``Contig_Length`` sums to zero.
    """
    return grouped.assign(
        GCGB=grouped['Defense_Number'] * 1000000 / grouped['Contig_Length']
    )


def main(argv=None):
    """Command-line entry point: read the input CSV, aggregate, export.

    Args:
        argv: argument list to parse; ``None`` means use ``sys.argv[1:]``.
    """
    # Define and parse command-line arguments
    parser = argparse.ArgumentParser(description='Group and calculate defense statistics.')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to input CSV file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to output CSV file')
    args = parser.parse_args(argv)

    # Read the input file
    print(f"Reading input file: {args.input}")
    df = pd.read_csv(args.input)

    # Group and aggregate the data
    print("Grouping and aggregating data...")
    grouped = aggregate_by_sample(df)

    # Calculate the final result
    print("Calculating GCGB...")
    grouped = add_gcgb(grouped)

    # Export the result
    print(f"Exporting results to: {args.output}")
    grouped.to_csv(args.output, index=False, columns=OUTPUT_COLUMNS)
    print(f"Results have been exported to {args.output}")


if __name__ == "__main__":
    main()