-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_data_sources.py
102 lines (76 loc) · 2.7 KB
/
csv_data_sources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import os
from datetime import datetime
import numpy as np
def get_column_type(column):
    """Classify a pandas Series into a coarse, human-readable type label.

    Args:
        column: The pandas Series to inspect (typically one DataFrame column).

    Returns:
        One of ``"NULL"``, ``"date"``, ``"integer"``, ``"float"``,
        ``"boolean"``, ``"list"`` or ``"string"``.

    Note:
        For text/object columns only the first non-null value is sampled to
        decide between ``"date"``, ``"list"`` and ``"string"``; the remaining
        values are not inspected.
    """
    # ``isnull().all()`` is also True for an empty Series, so this single
    # check covers both "no rows" and "only missing values".
    if column.isnull().all():
        return "NULL"

    # At least one non-null value is guaranteed past this point.
    sample = column.dropna()
    dtype = sample.dtype

    if pd.api.types.is_datetime64_any_dtype(sample):
        return "date"

    # Treat both the classic ``object`` dtype and the pandas string dtype as
    # text; comparing against ``object`` alone misses the latter.
    is_text = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    first = sample.iloc[0]

    if is_text:
        try:
            pd.to_datetime(first)
        except (ValueError, TypeError, OverflowError):
            # Not parseable as a date; fall through to the other checks.
            # Only parsing failures are caught so that KeyboardInterrupt
            # and SystemExit still propagate.
            pass
        else:
            return "date"

    # The pandas dtype predicates understand extension dtypes (e.g. the
    # nullable ``Int64``), on which ``np.issubdtype`` raises TypeError.
    if pd.api.types.is_integer_dtype(dtype):
        return "integer"
    if pd.api.types.is_float_dtype(dtype):
        return "float"
    if pd.api.types.is_bool_dtype(dtype):
        return "boolean"

    # A text value containing a comma is reported as a comma-separated list.
    if is_text and first and ',' in str(first):
        return "list"

    # Everything else is reported as a plain string.
    return "string"
def analyze_csv_datatypes(folder_path="data"):
    """Analyze the column data types of every CSV file in a folder.

    Args:
        folder_path: Directory to scan for files whose names end in ``.csv``.
            Sub-directories are not searched.

    Returns:
        A dict mapping each CSV file name to a dict of
        ``{column name: type label}`` as produced by ``get_column_type``.
        Files are processed in sorted name order.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    results = {}

    # ``os.listdir`` returns entries in arbitrary order, so sort them to keep
    # the progress output and the resulting report stable between runs.
    # Directories that merely happen to be named ``*.csv`` are skipped.
    csv_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    for file in csv_files:
        print(f"\nAnalyzing {file}...")

        df = pd.read_csv(os.path.join(folder_path, file))

        # Classify every column of this file.
        results[file] = {
            column: get_column_type(df[column]) for column in df.columns
        }

    return results
def format_results(results):
    """Format analysis results into a readable, column-aligned string.

    Args:
        results: Mapping of file name to a mapping of
            ``{column name: type label}``.

    Returns:
        A single string with one section per file: a header line, a
        50-character dashed rule, then one ``name : type`` line per column,
        with column names padded to the longest name in that file. An empty
        ``results`` mapping yields an empty string.
    """
    output = []

    for file, columns in results.items():
        output.append(f"\n{file} datatypes:")
        output.append("-" * 50)

        # ``default=0`` keeps a file with no columns from raising ValueError
        # (``max`` of an empty sequence); such a file gets a header only.
        max_len = max((len(col) for col in columns), default=0)

        output.extend(
            f"{column.ljust(max_len)} : {dtype}"
            for column, dtype in columns.items()
        )

    return "\n".join(output)
def main():
    """Analyze the CSV files in the default folder and report their types.

    Prints the formatted report to standard output and also writes it to
    ``data_types.txt`` in the current working directory.
    """
    # Analyze all CSV files in the default folder.
    results = analyze_csv_datatypes()

    # Format and print results.
    formatted_output = format_results(results)
    print(formatted_output)

    # Save results to file. The encoding is set explicitly so that non-ASCII
    # column names cannot fail under a non-UTF-8 platform default encoding.
    with open('data_types.txt', 'w', encoding='utf-8') as f:
        f.write(formatted_output)

    print("\nResults have been saved to data_types.txt")
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()