Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FT] Add XLSX data source #38

Merged
merged 1 commit into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data/certificates.xlsx
Binary file not shown.
20 changes: 10 additions & 10 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/data_sources/csv_data_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from typing import Optional, List, Any
from typing import Optional, List


class CSVDataSource:
Expand Down
93 changes: 93 additions & 0 deletions src/data_sources/xlsx_data_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import pandas as pd
from typing import Optional, List


class XLSXDataSource:
    """
    XLSX DataSource class to manage data retrieval from Excel (.xlsx) files.

    The requested sheet is read once, eagerly, when the instance is created.
    Assigning a new ``file_path`` or ``sheet_name`` afterwards only validates
    and stores the value; it does not reload ``dataframe``.

    Attributes:
        file_path (str): Path to the Excel file.
        sheet_name (str): Name of the sheet to load from the Excel file.
        dataframe (Optional[pd.DataFrame]): Loaded data as a pandas DataFrame.
    """

    def __init__(self, file_path: str, sheet_name: str):
        """
        Validate the arguments and load the sheet into memory.

        Args:
            file_path (str): Path to the Excel file.
            sheet_name (str): Name of the sheet to load.

        Raises:
            TypeError: If either argument is not a string.
            ValueError: If either argument is empty or only whitespace.
        """
        # Both assignments go through the validating property setters below.
        self.file_path = file_path
        self.sheet_name = sheet_name
        self.dataframe: Optional[pd.DataFrame] = None
        self.__load_data()

    def __load_data(self) -> None:
        """
        Load data from the Excel file into a pandas DataFrame.
        """
        self.dataframe = pd.read_excel(self.file_path, sheet_name=self.sheet_name)

    @property
    def file_path(self) -> str:
        """Path to the Excel file."""
        return self.__file_path

    @file_path.setter
    def file_path(self, file_path: str) -> None:
        if not isinstance(file_path, str):
            raise TypeError("'file_path' must be a string.")
        if not file_path.strip():
            raise ValueError("'file_path' cannot be an empty string.")
        self.__file_path = file_path

    @property
    def sheet_name(self) -> str:
        """Name of the sheet to load from the Excel file."""
        return self.__sheet_name

    @sheet_name.setter
    def sheet_name(self, sheet_name: str) -> None:
        if not isinstance(sheet_name, str):
            raise TypeError("'sheet_name' must be a string.")
        # Mirror the 'file_path' check: a blank name can never match a sheet,
        # so reject it here instead of failing later inside the reader.
        if not sheet_name.strip():
            raise ValueError("'sheet_name' cannot be an empty string.")
        self.__sheet_name = sheet_name

    def fetch_data(self) -> pd.DataFrame:
        """
        Fetch all data from the Excel sheet as a pandas DataFrame.

        Returns:
            pd.DataFrame: Data loaded from the Excel sheet. This is the
                instance's own DataFrame object, not a copy.

        Raises:
            RuntimeError: If no data is loaded.
        """
        if self.dataframe is None:
            raise RuntimeError("No data loaded from the Excel file.")
        return self.dataframe

    def get_columns(self) -> List[str]:
        """
        Get the list of column names from the Excel data.

        Returns:
            List[str]: Column names.

        Raises:
            RuntimeError: If no data is loaded.
        """
        if self.dataframe is None:
            raise RuntimeError("No data loaded from the Excel file.")
        return list(self.dataframe.columns)

    def filter_data(self, query: str) -> pd.DataFrame:
        """
        Filter the Excel data using a pandas query string.

        Args:
            query (str): Query string to filter the data.

        Returns:
            pd.DataFrame: Filtered data based on the query.

        Raises:
            RuntimeError: If no data is loaded.
            ValueError: If the query is invalid.
        """
        if self.dataframe is None:
            raise RuntimeError("No data loaded from the Excel file.")

        try:
            return self.dataframe.query(query)
        except (SyntaxError, NameError, KeyError, TypeError) as err:
            # pandas reports malformed expressions and unknown names with
            # several exception types; surface them as the documented
            # ValueError while keeping the original error as the cause.
            raise ValueError(f"Invalid query {query!r}: {err}") from err

    def __str__(self) -> str:
        return (
            f"XLSXDataSource(file_path={self.file_path}, sheet_name={self.sheet_name})"
        )
23 changes: 9 additions & 14 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import os
from models.template import Template
from services.template_manager import TemplateManager
from services.s3_delivery import AWSConfig, S3Delivery
from data_sources.csv_data_source import CSVDataSource
from data_sources.xlsx_data_source import XLSXDataSource


def main():

csv_data_source = CSVDataSource(file_path="data/certificates.csv", delimiter=";")
xlsx_data_source = XLSXDataSource(
file_path="data/certificates.xlsx", sheet_name="Sheet1"
)

full_data = csv_data_source.fetch_data()
print("Datos completos cargados desde el CSV:")
full_data = xlsx_data_source.fetch_data()
print("Datos completos cargados desde el XLSX:")
print(full_data.head())

columns = csv_data_source.get_columns()
print(f"Columnas encontradas en el CSV: {columns}")
columns = xlsx_data_source.get_columns()
print(f"Columnas encontradas en el XLSX: {columns}")

filtered_data = csv_data_source.filter_data('lang == "es"')
filtered_data = xlsx_data_source.filter_data('lang == "es"')
print(f"Datos filtrados (solo español):")
print(filtered_data)

Expand Down Expand Up @@ -46,12 +47,6 @@ def main():
for file in mytuple:
print(file)

print("Subiendo archivos a S3...")
S3Delivery(AWSConfig.from_profile()).upload_many_files(
files=mytuple,
bucket_name="",
)


if __name__ == "__main__":
main()
Loading