diff --git a/docs/mint.json b/docs/mint.json index 5ed0f6d6b..079defb7e 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -59,7 +59,7 @@ }, { "group": "Data", - "pages": ["v3/data-layer", "v3/semantic-layer", "v3/data-ingestion", "v3/dataframes"], + "pages": ["v3/data-layer", "v3/semantic-layer", "v3/data-ingestion", "v3/transformations", "v3/dataframes"], "version": "v3" }, { diff --git a/docs/v3/transformations.mdx b/docs/v3/transformations.mdx new file mode 100644 index 000000000..76e6af502 --- /dev/null +++ b/docs/v3/transformations.mdx @@ -0,0 +1,416 @@ +--- +title: 'Data Transformations' +description: 'Available data transformations in PandaAI' +--- + + +Release v3 is currently in beta. This documentation reflects the features and functionality in progress and may change before the final release. + + +## Data Transformations in PandaAI + +PandaAI provides a rich set of data transformations that can be applied to your data. These transformations can be specified in your schema file or applied programmatically. + +### String Transformations + +```yaml +transformations: + # Convert text to lowercase + - type: to_lowercase + params: + column: product_name + + # Convert text to uppercase + - type: to_uppercase + params: + column: category + + # Remove leading/trailing whitespace + - type: strip + params: + column: description + + # Truncate text to specific length + - type: truncate + params: + column: description + length: 100 + add_ellipsis: true # Optional, adds "..." to truncated text + + # Pad strings to fixed width + - type: pad + params: + column: product_code + width: 10 + side: left # Optional: "left" or "right", default "left" + pad_char: "0" # Optional, default " " + + # Extract text using regex + - type: extract + params: + column: product_code + pattern: "^[A-Z]+-(\d+)" # Extracts numbers after hyphen +``` + +### Numeric Transformations + +```yaml +transformations: + # Round numbers to specified decimals + - type: round_numbers + params: + column: price + decimals: 2 + + # Scale values by a factor + - type: scale + params: + column: price + factor: 1.1 # 10% increase + + # Clip values to bounds + - type: clip + params: + column: quantity + lower: 0 # Optional + upper: 100 # Optional + + # Normalize to 0-1 range + - type: normalize + params: + column: score + + # Standardize using z-score + - type: standardize + params: + column: score + + # Ensure positive values + - type: ensure_positive + params: + column: amount + drop_negative: false # Optional, drops rows with negative values if true + + # Bin continuous data + - type: bin + params: + column: age + bins: [0, 18, 35, 50, 65, 100] # Or specify number of bins: bins: 5 + labels: ["0-18", "19-35", "36-50", "51-65", "65+"] # Optional +``` + +### Date and Time Transformations + +```yaml +transformations: + # Convert timezone + - type: convert_timezone + params: + column: timestamp + to: "US/Pacific" + + # Format dates + - type: format_date + params: + column: date + format: "%Y-%m-%d" + + # Convert to datetime + - type: to_datetime + params: + column: date + format: "%Y-%m-%d" # Optional + errors: "coerce" # Optional: "raise", "coerce", or "ignore" + + # Validate date range + - type: validate_date_range + params: + column: date + start_date: "2024-01-01" + end_date: "2024-12-31" + drop_invalid: false # Optional +``` + +### Data Cleaning Transformations + +```yaml +transformations: + # Fill missing values + - type: fill_na + params: + column: quantity + value: 0 + + # Replace values + - type: replace + params: + column: status 
+      old_value: "inactive"
+      new_value: "disabled"
+
+  # Remove duplicates
+  - type: remove_duplicates
+    params:
+      columns: ["order_id", "product_id"]
+      keep: "first" # Optional: "first", "last", or false
+
+  # Normalize phone numbers
+  - type: normalize_phone
+    params:
+      column: phone
+      country_code: "+1" # Optional, default "+1"
+```
+
+### Categorical Transformations
+
+```yaml
+transformations:
+  # One-hot encode categories
+  - type: encode_categorical
+    params:
+      column: category
+      drop_first: true # Optional
+
+  # Map values using dictionary
+  - type: map_values
+    params:
+      column: grade
+      mapping:
+        "A": 4.0
+        "B": 3.0
+        "C": 2.0
+
+  # Standardize categories
+  - type: standardize_categories
+    params:
+      column: company
+      mapping:
+        "Apple Inc.": "Apple"
+        "Apple Computer": "Apple"
+```
+
+### Validation Transformations
+
+```yaml
+transformations:
+  # Validate email format
+  - type: validate_email
+    params:
+      column: email
+      drop_invalid: false # Optional
+
+  # Validate foreign key references
+  - type: validate_foreign_key
+    params:
+      column: user_id
+      ref_df: users # Reference DataFrame
+      ref_column: id
+      drop_invalid: false # Optional
+```
+
+### Privacy and Security Transformations
+
+```yaml
+transformations:
+  # Anonymize sensitive data
+  - type: anonymize
+    params:
+      column: email # Replaces the username in emails with asterisks
+```
+
+### Type Conversion Transformations
+
+```yaml
+transformations:
+  # Convert to numeric type
+  - type: to_numeric
+    params:
+      column: amount
+      errors: "coerce" # Optional: "raise", "coerce", or "ignore"
+```
+
+## Chaining Transformations
+
+You can chain multiple transformations; they are applied in the order they are specified:
+
+```yaml
+transformations:
+  - type: to_lowercase
+    params:
+      column: product_name
+  - type: strip
+    params:
+      column: product_name
+  - type: truncate
+    params:
+      column: product_name
+      length: 50
+```
+
+## Programmatic Usage
+
+While schema files are convenient for static transformations, you can also apply transformations programmatically using the `TransformationManager`:
+
+```python
+import pandasai as pai
+from pandasai.data_loader.transformation_manager import TransformationManager
+
+df = pai.read_csv("data.csv")
+manager = TransformationManager(df)
+result = (manager
+    .validate_email("email", drop_invalid=True)
+    .normalize_phone("phone")
+    .validate_date_range("birth_date", "1900-01-01", "2024-01-01")
+    .remove_duplicates("user_id")
+    .ensure_positive("amount")
+    .standardize_categories("company", {"Apple Inc.": "Apple"})
+    .df)
+```
+
+This fluent interface chains multiple transformations together: each method returns the manager instance, so further calls can be appended, and the final `.df` attribute returns the transformed DataFrame.
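+
+The same dictionary format used in schema files can also be passed directly to `apply_transformations`, which applies each entry in order and returns the transformed DataFrame. A minimal sketch (the DataFrame and column names below are illustrative):
+
+```python
+import pandas as pd
+
+from pandasai.data_loader.transformation_manager import TransformationManager
+
+df = pd.DataFrame({"name": ["  widget a  ", "gadget b"], "price": [9.991, 19.456]})
+
+manager = TransformationManager(df)
+result = manager.apply_transformations([
+    {"type": "strip", "params": {"column": "name"}},         # "  widget a  " -> "widget a"
+    {"type": "to_uppercase", "params": {"column": "name"}},  # "widget a" -> "WIDGET A"
+    {"type": "round_numbers", "params": {"column": "price", "decimals": 2}},
+])
+
+print(result)  # names stripped and upper-cased, prices rounded to two decimals
+```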
+
+## Complete Example
+
+Let's walk through a complete example of data transformation using a sales dataset. This example shows how to clean, validate, and prepare your data for analysis.
+
+### Sample Data
+
+Consider a CSV file `sales_data.csv` with the following structure:
+
+```csv
+date,store_id,product_name,category,quantity,unit_price,customer_email
+2024-01-15,ST001, iPhone 13 Pro,Electronics,2,999.99,john.doe@email.com
+2024-01-15,ST002,macBook Pro ,Electronics,-1,1299.99,invalid.email
+2024-01-16,ST001,AirPods Pro,Electronics,3,249.99,jane@example.com
+2024-01-16,ST003,iMac 27" ,Electronics,1,1799.99,
+```
+
+### Schema File
+
+Create a `schema.yaml` file to define the transformations:
+
+```yaml
+name: sales_data
+description: "Daily sales data from retail stores"
+source:
+  type: csv
+  path: "sales_data.csv"
+
+transformations:
+  # Clean up product names
+  - type: strip
+    params:
+      column: product_name
+  - type: standardize_categories
+    params:
+      column: product_name
+      mapping:
+        "iPhone 13 Pro": "iPhone 13 Pro"
+        "macBook Pro": "MacBook Pro"
+        "AirPods Pro": "AirPods Pro"
+        "iMac 27\"": "iMac 27-inch"
+
+  # Format dates
+  - type: to_datetime
+    params:
+      column: date
+      format: "%Y-%m-%d"
+
+  # Validate and clean store IDs
+  - type: pad
+    params:
+      column: store_id
+      width: 8
+      side: "right"
+      pad_char: "0"
+
+  # Ensure valid quantities
+  - type: ensure_positive
+    params:
+      column: quantity
+      drop_negative: true
+
+  # Add 10% tax to prices
+  - type: scale
+    params:
+      column: unit_price
+      factor: 1.1
+
+  # Format prices
+  - type: round_numbers
+    params:
+      column: unit_price
+      decimals: 2
+
+  # Validate emails
+  - type: validate_email
+    params:
+      column: customer_email
+      drop_invalid: false
+
+columns:
+  date:
+    type: datetime
+    description: "Date of sale"
+  store_id:
+    type: string
+    description: "Store identifier"
+  product_name:
+    type: string
+    description: "Product name"
+  category:
+    type: string
+    description: "Product category"
+  quantity:
+    type: integer
+    description: "Number of units sold"
+  unit_price:
+    type: float
+    description: "Price per unit"
+  customer_email:
+    type: string
+    description: "Customer email address"
+```
+
+### Python Code
+
+Here's how to use the schema and transformations in your code:
+
+```python
+import pandasai as pai
+
+# Load the dataset defined by the schema we just created;
+# the transformations are applied when the data is loaded
+df = pai.load("my-org/sales-data")
+
+# The resulting DataFrame will have:
+# - Cleaned and standardized product names
+# - Properly formatted dates
+# - Padded store IDs (e.g., "ST001000")
+# - Only positive quantities
+# - Prices with 10% tax, rounded to two decimals
+# - Validated email addresses
+
+# You can now analyze the data
+response = df.chat("What's our best-selling product?")
+
+# Or export the transformed data
+df.to_csv("cleaned_sales_data.csv")
+```
+
+### Result
+
+The transformed data will look like this:
+
+```csv
+date,store_id,product_name,category,quantity,unit_price,customer_email,customer_email_valid
+2024-01-15,ST001000,iPhone 13 Pro,Electronics,2,1099.99,john.doe@email.com,True
+2024-01-16,ST001000,AirPods Pro,Electronics,3,274.99,jane@example.com,True
+2024-01-16,ST003000,iMac 27-inch,Electronics,1,1979.99,,False
+```
+
+Notice how the transformations have:
+- Standardized product names
+- Padded store IDs
+- Removed rows with negative quantities
+- Added 10% tax to prices
+- Validated email addresses
+- Added a `customer_email_valid` flag column
+
+This example demonstrates how to combine multiple transformations to clean and prepare your data for analysis: they are applied in sequence, and each transformation builds on the result of the previous one.
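+
+## Error Handling
+
+If a transformation list references a type that is not supported, `apply_transformations` raises an `UnsupportedTransformation` error. A minimal sketch of catching it (the transformation name below is intentionally invalid):
+
+```python
+import pandas as pd
+
+from pandasai.data_loader.transformation_manager import TransformationManager
+from pandasai.exceptions import UnsupportedTransformation
+
+df = pd.DataFrame({"amount": [1, 2, 3]})
+
+try:
+    TransformationManager(df).apply_transformations(
+        [{"type": "not_a_real_transformation", "params": {"column": "amount"}}]
+    )
+except UnsupportedTransformation as exc:
+    print(f"Invalid schema: {exc}")
+```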
diff --git a/pandasai/data_loader/transformation_manager.py b/pandasai/data_loader/transformation_manager.py index 7e4aa710e..aa12290dd 100644 --- a/pandasai/data_loader/transformation_manager.py +++ b/pandasai/data_loader/transformation_manager.py @@ -1,5 +1,5 @@ -from typing import List, Optional - +from typing import List, Optional, Union, Any +import numpy as np import pandas as pd from ..exceptions import UnsupportedTransformation @@ -28,7 +28,7 @@ def _anonymize(self, value: str) -> str: """ if pd.isna(value): return value - + value_str = str(value) if "@" in value_str: username, domain = value_str.split("@", 1) @@ -43,13 +43,20 @@ def anonymize(self, column: str) -> "TransformationManager": Returns: TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"email": ["user@example.com", "another@domain.com"]}) + >>> manager = TransformationManager(df) + >>> result = manager.anonymize("email").df + >>> print(result) + email + 0 ****@example.com + 1 *******@domain.com """ self.df[column] = self.df[column].apply(self._anonymize) return self - def convert_timezone( - self, column: str, to_timezone: str - ) -> "TransformationManager": + def convert_timezone(self, column: str, to_timezone: str) -> "TransformationManager": """Convert timezone for datetime column. Args: @@ -58,13 +65,692 @@ def convert_timezone( Returns: TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"timestamp": ["2024-01-01 12:00:00+00:00"]}) + >>> manager = TransformationManager(df) + >>> result = manager.convert_timezone("timestamp", "US/Pacific").df + >>> print(result) + timestamp + 0 2024-01-01 04:00:00-08:00 """ self.df[column] = pd.to_datetime(self.df[column]).dt.tz_convert(to_timezone) return self - def apply_transformations( - self, transformations: Optional[List[dict]] = None - ) -> pd.DataFrame: + def to_lowercase(self, column: str) -> "TransformationManager": + """Convert all values in a column to lowercase. + + Args: + column (str): The column to transform + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"text": ["Hello", "WORLD"]}) + >>> manager = TransformationManager(df) + >>> result = manager.to_lowercase("text").df + >>> print(result) + text + 0 hello + 1 world + """ + self.df[column] = self.df[column].str.lower() + return self + + def to_uppercase(self, column: str) -> "TransformationManager": + """Convert all values in a column to uppercase. + + Args: + column (str): The column to transform + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"text": ["Hello", "world"]}) + >>> manager = TransformationManager(df) + >>> result = manager.to_uppercase("text").df + >>> print(result) + text + 0 HELLO + 1 WORLD + """ + self.df[column] = self.df[column].str.upper() + return self + + def strip(self, column: str) -> "TransformationManager": + """Remove leading and trailing whitespace from values in a column. + + Args: + column (str): The column to transform + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"text": [" Hello ", " World "]}) + >>> manager = TransformationManager(df) + >>> result = manager.strip("text").df + >>> print(result) + text + 0 Hello + 1 World + """ + self.df[column] = self.df[column].str.strip() + return self + + def round_numbers(self, column: str, decimals: int) -> "TransformationManager": + """Round numeric values in a column to specified decimal places. 
+ + Args: + column (str): The column to transform + decimals (int): Number of decimal places to round to + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"price": [10.126, 20.983]}) + >>> manager = TransformationManager(df) + >>> result = manager.round_numbers("price", 2).df + >>> print(result) + price + 0 10.13 + 1 20.98 + """ + self.df[column] = self.df[column].round(decimals) + return self + + def scale(self, column: str, factor: float) -> "TransformationManager": + """Multiply values in a column by a scaling factor. + + Args: + column (str): The column to transform + factor (float): The scaling factor to multiply by + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"price": [10, 20]}) + >>> manager = TransformationManager(df) + >>> result = manager.scale("price", 1.1).df # 10% increase + >>> print(result) + price + 0 11.0 + 1 22.0 + """ + self.df[column] = self.df[column] * factor + return self + + def format_date(self, column: str, date_format: str) -> "TransformationManager": + """Format datetime values in a column according to the specified format. + + Args: + column (str): The column to transform + date_format (str): The desired date format (e.g., "%Y-%m-%d") + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"date": ["2024-01-01 12:30:45"]}) + >>> manager = TransformationManager(df) + >>> result = manager.format_date("date", "%Y-%m-%d").df + >>> print(result) + date + 0 2024-01-01 + """ + self.df[column] = self.df[column].dt.strftime(date_format) + return self + + def to_numeric(self, column: str, errors: str = "coerce") -> "TransformationManager": + """Convert values in a column to numeric type. + + Args: + column (str): The column to transform + errors (str): How to handle parsing errors: + 'raise': raise an exception + 'coerce': set errors to NaN + 'ignore': return the input + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"value": ["1.23", "4.56", "invalid"]}) + >>> manager = TransformationManager(df) + >>> result = manager.to_numeric("value", errors="coerce").df + >>> print(result) + value + 0 1.23 + 1 4.56 + 2 NaN + """ + self.df[column] = pd.to_numeric(self.df[column], errors=errors) + return self + + def to_datetime(self, column: str, format: Optional[str] = None, errors: str = "coerce") -> "TransformationManager": + """Convert values in a column to datetime type. + + Args: + column (str): The column to transform + format (Optional[str]): Expected date format of the input + errors (str): How to handle parsing errors + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"date": ["2024-01-01", "invalid"]}) + >>> manager = TransformationManager(df) + >>> result = manager.to_datetime("date", errors="coerce").df + >>> print(result) + date + 0 2024-01-01 + 1 NaT + """ + self.df[column] = pd.to_datetime(self.df[column], format=format, errors=errors) + return self + + def fill_na(self, column: str, value: Any) -> "TransformationManager": + """Fill NA/NaN values in a column with the specified value. 
+ + Args: + column (str): The column to transform + value (Any): Value to use to fill NA/NaN + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"value": [1, None, 3]}) + >>> manager = TransformationManager(df) + >>> result = manager.fill_na("value", 0).df + >>> print(result) + value + 0 1 + 1 0 + 2 3 + """ + self.df[column] = self.df[column].fillna(value) + return self + + def replace(self, column: str, old_value: str, new_value: str) -> "TransformationManager": + """Replace occurrences of old_value with new_value in the column. + + Args: + column (str): The column to transform + old_value (str): Value to replace + new_value (str): Replacement value + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"status": ["active", "inactive", "active"]}) + >>> manager = TransformationManager(df) + >>> result = manager.replace("status", "inactive", "disabled").df + >>> print(result) + status + 0 active + 1 disabled + 2 active + """ + self.df[column] = self.df[column].str.replace(old_value, new_value) + return self + + def extract(self, column: str, pattern: str) -> "TransformationManager": + """Extract text matching the regex pattern. + + Args: + column (str): The column to transform + pattern (str): Regular expression pattern to extract + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"text": ["ID: 123", "ID: 456"]}) + >>> manager = TransformationManager(df) + >>> result = manager.extract("text", r"ID: (\d+)").df + >>> print(result) + text + 0 123 + 1 456 + """ + self.df[column] = self.df[column].str.extract(pattern, expand=False) + return self + + def truncate(self, column: str, length: int, add_ellipsis: bool = True) -> "TransformationManager": + """Truncate strings to specified length. + + Args: + column (str): The column to transform + length (int): Maximum length of string + add_ellipsis (bool): Whether to add "..." to truncated strings + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"text": ["very long text", "short"]}) + >>> manager = TransformationManager(df) + >>> result = manager.truncate("text", 8, add_ellipsis=True).df + >>> print(result) + text + 0 very... + 1 short + """ + def _truncate(x): + if pd.isna(x): + return x + s = str(x) + if len(s) <= length: + return s + if add_ellipsis: + # Reserve 3 characters for ellipsis + return s[:length-3].rstrip() + "..." + return s[:length] + + self.df[column] = self.df[column].apply(_truncate) + return self + + def pad(self, column: str, width: int, side: str = "left", pad_char: str = " ") -> "TransformationManager": + """Pad strings to a specified width. + + Args: + column (str): The column to transform + width (int): Desired string width + side (str): Side to pad ('left' or 'right') + pad_char (str): Character to use for padding + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"id": ["1", "23"]}) + >>> manager = TransformationManager(df) + >>> result = manager.pad("id", width=3, side="left", pad_char="0").df + >>> print(result) + id + 0 001 + 1 023 + """ + if side == "left": + self.df[column] = self.df[column].str.rjust(width, pad_char) + else: + self.df[column] = self.df[column].str.ljust(width, pad_char) + return self + + def clip(self, column: str, lower: Optional[float] = None, upper: Optional[float] = None) -> "TransformationManager": + """Clip values to be between lower and upper bounds. 
+ + Args: + column (str): The column to transform + lower (Optional[float]): Lower bound + upper (Optional[float]): Upper bound + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"score": [0, 50, 100, 150]}) + >>> manager = TransformationManager(df) + >>> result = manager.clip("score", lower=0, upper=100).df + >>> print(result) + score + 0 0 + 1 50 + 2 100 + 3 100 + """ + self.df[column] = self.df[column].clip(lower=lower, upper=upper) + return self + + def bin(self, column: str, bins: Union[int, List[float]], labels: Optional[List[str]] = None) -> "TransformationManager": + """Bin continuous data into discrete intervals. + + Args: + column (str): The column to transform + bins (Union[int, List[float]]): Number of bins or bin edges + labels (Optional[List[str]]): Labels for the bins + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"age": [25, 35, 45, 55]}) + >>> manager = TransformationManager(df) + >>> result = manager.bin("age", bins=[0, 30, 50, 100], + ... labels=["Young", "Middle", "Senior"]).df + >>> print(result) + age + 0 Young + 1 Middle + 2 Middle + 3 Senior + """ + self.df[column] = pd.cut(self.df[column], bins=bins, labels=labels) + return self + + def normalize(self, column: str) -> "TransformationManager": + """Normalize values to 0-1 range using min-max scaling. + + Args: + column (str): The column to transform + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"score": [0, 50, 100]}) + >>> manager = TransformationManager(df) + >>> result = manager.normalize("score").df + >>> print(result) + score + 0 0.0 + 1 0.5 + 2 1.0 + """ + min_val = self.df[column].min() + max_val = self.df[column].max() + self.df[column] = (self.df[column] - min_val) / (max_val - min_val) + return self + + def standardize(self, column: str) -> "TransformationManager": + """Standardize values using z-score normalization. + + Args: + column (str): The column to transform + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"score": [1, 2, 3]}) + >>> manager = TransformationManager(df) + >>> result = manager.standardize("score").df + >>> print(result) + score + 0 -1.224745 + 1 0.000000 + 2 1.224745 + """ + mean = self.df[column].mean() + std = self.df[column].std() + self.df[column] = (self.df[column] - mean) / std + return self + + def map_values(self, column: str, mapping: dict) -> "TransformationManager": + """Map values to new values using a dictionary. + + Args: + column (str): The column to transform + mapping (dict): Dictionary mapping old values to new values + e.g., {"Apple Inc.": "Apple", "Apple Computer": "Apple"} + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"grade": ["A", "B", "C"]}) + >>> mapping = {"A": 4.0, "B": 3.0, "C": 2.0} + >>> manager = TransformationManager(df) + >>> result = manager.map_values("grade", mapping).df + >>> print(result) + grade + 0 4.0 + 1 3.0 + 2 2.0 + """ + self.df[column] = self.df[column].map(mapping) + return self + + def encode_categorical(self, column: str, drop_first: bool = True) -> "TransformationManager": + """One-hot encode categorical variables. 
+ + Args: + column (str): The column to transform + drop_first (bool): Whether to drop the first category + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"color": ["red", "blue", "red"]}) + >>> manager = TransformationManager(df) + >>> result = manager.encode_categorical("color").df + >>> print(result) + color_red + 0 1 + 1 0 + 2 1 + """ + encoded = pd.get_dummies(self.df[column], prefix=column, drop_first=drop_first) + self.df = pd.concat([self.df.drop(columns=[column]), encoded], axis=1) + return self + + def validate_email(self, column: str, drop_invalid: bool = False) -> "TransformationManager": + """Validate email format in a column. + + Args: + column (str): The column to validate + drop_invalid (bool): If True, drop rows with invalid emails + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"email": ["user@example.com", "invalid.email"]}) + >>> manager = TransformationManager(df) + >>> result = manager.validate_email("email", drop_invalid=True).df + >>> print(result) + email + 0 user@example.com + """ + import re + email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' + + def is_valid_email(x): + if pd.isna(x): + return False + return bool(re.match(email_pattern, str(x))) + + if drop_invalid: + self.df = self.df[self.df[column].apply(is_valid_email)] + else: + self.df[f"{column}_valid"] = self.df[column].apply(is_valid_email) + return self + + def validate_date_range(self, column: str, start_date: str, end_date: str, + drop_invalid: bool = False) -> "TransformationManager": + """Validate dates are within a specified range. + + Args: + column (str): The column to validate + start_date (str): Minimum valid date (YYYY-MM-DD) + end_date (str): Maximum valid date (YYYY-MM-DD) + drop_invalid (bool): If True, drop rows with invalid dates + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"date": ["2024-01-01", "2025-12-31"]}) + >>> manager = TransformationManager(df) + >>> result = manager.validate_date_range( + ... "date", "2024-01-01", "2024-12-31", drop_invalid=True).df + >>> print(result) + date + 0 2024-01-01 + """ + start = pd.to_datetime(start_date) + end = pd.to_datetime(end_date) + + def is_valid_date(x): + if pd.isna(x): + return False + date = pd.to_datetime(x, errors='coerce') + if pd.isna(date): + return False + return start <= date <= end + + if drop_invalid: + self.df = self.df[self.df[column].apply(is_valid_date)] + else: + self.df[f"{column}_valid"] = self.df[column].apply(is_valid_date) + return self + + def normalize_phone(self, column: str, country_code: str = "+1") -> "TransformationManager": + """Normalize phone numbers to a standard format. 
+ + Args: + column (str): The column to transform + country_code (str): Default country code to prepend + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"phone": ["123-456-7890", "(123) 456-7890"]}) + >>> manager = TransformationManager(df) + >>> result = manager.normalize_phone("phone", country_code="+1").df + >>> print(result) + phone + 0 +1-123-456-7890 + 1 +1-123-456-7890 + """ + def clean_phone(x): + if pd.isna(x): + return x + # Remove all non-digit characters + phone = ''.join(filter(str.isdigit, str(x))) + + # Handle different cases + if len(phone) == 10: # Standard US number + return f"{country_code}-{phone[:3]}-{phone[3:6]}-{phone[6:]}" + elif len(phone) > 10: # International number + return f"+{phone[:-10]}-{phone[-10:-7]}-{phone[-7:-4]}-{phone[-4:]}" + return x # Return original if format unknown + + self.df[column] = self.df[column].apply(clean_phone) + return self + + def remove_duplicates(self, columns: Union[str, List[str]], + keep: str = "first") -> "TransformationManager": + """Remove duplicate rows based on specified columns. + + Args: + columns (Union[str, List[str]]): Column(s) to identify duplicates + keep (str): Which duplicate to keep ('first', 'last', or False) + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({ + ... "id": [1, 2, 1], + ... "name": ["John", "Jane", "John"] + ... }) + >>> manager = TransformationManager(df) + >>> result = manager.remove_duplicates(["name"]).df + >>> print(result) + id name + 0 1 John + 1 2 Jane + """ + self.df = self.df.drop_duplicates(subset=columns, keep=keep) + return self + + def validate_foreign_key(self, column: str, ref_df: pd.DataFrame, + ref_column: str, drop_invalid: bool = False) -> "TransformationManager": + """Validate foreign key references against another DataFrame. + + Args: + column (str): The column containing foreign keys + ref_df (pd.DataFrame): Reference DataFrame + ref_column (str): Column in reference DataFrame to check against + drop_invalid (bool): If True, drop rows with invalid references + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> orders = pd.DataFrame({"user_id": [1, 2, 3]}) + >>> users = pd.DataFrame({"id": [1, 2]}) + >>> manager = TransformationManager(orders) + >>> result = manager.validate_foreign_key( + ... "user_id", users, "id", drop_invalid=True).df + >>> print(result) + user_id + 0 1 + 1 2 + """ + valid_values = set(ref_df[ref_column].unique()) + + def is_valid_reference(x): + if pd.isna(x): + return False + return x in valid_values + + if drop_invalid: + self.df = self.df[self.df[column].apply(is_valid_reference)] + else: + self.df[f"{column}_valid"] = self.df[column].apply(is_valid_reference) + return self + + def ensure_positive(self, column: str, drop_negative: bool = False) -> "TransformationManager": + """Ensure numeric values are positive. 
+ + Args: + column (str): The column to validate + drop_negative (bool): If True, drop rows with negative values + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"score": [-1, 0, 1]}) + >>> manager = TransformationManager(df) + >>> result = manager.ensure_positive("score", drop_negative=True).df + >>> print(result) + score + 1 0 + 2 1 + """ + if drop_negative: + self.df = self.df[self.df[column] >= 0].copy() + else: + self.df[column] = self.df[column].clip(lower=0) + return self + + def standardize_categories(self, column: str, mapping: dict) -> "TransformationManager": + """Standardize categorical values using a mapping dictionary. + + Args: + column (str): The column to transform + mapping (dict): Dictionary mapping variations to standard forms + e.g., {"Apple Inc.": "Apple", "Apple Computer": "Apple"} + + Returns: + TransformationManager: Self for method chaining + + Example: + >>> df = pd.DataFrame({"company": ["Apple Inc.", "Apple Computer"]}) + >>> mapping = {"Apple Inc.": "Apple", "Apple Computer": "Apple"} + >>> manager = TransformationManager(df) + >>> result = manager.standardize_categories("company", mapping).df + >>> print(result) + company + 0 Apple + 1 Apple + """ + self.df[column] = self.df[column].replace(mapping) + return self + + def apply_transformations(self, transformations: Optional[List[dict]] = None) -> pd.DataFrame: """Apply a list of transformations to the DataFrame. Args: @@ -76,17 +762,55 @@ def apply_transformations( if not transformations: return self.df + # Map transformation types to their corresponding methods + transformation_handlers = { + "anonymize": lambda p: self.anonymize(p["column"]), + "convert_timezone": lambda p: self.convert_timezone(p["column"], p["to"]), + "to_lowercase": lambda p: self.to_lowercase(p["column"]), + "to_uppercase": lambda p: self.to_uppercase(p["column"]), + "strip": lambda p: self.strip(p["column"]), + "round_numbers": lambda p: self.round_numbers(p["column"], p["decimals"]), + "scale": lambda p: self.scale(p["column"], p["factor"]), + "format_date": lambda p: self.format_date(p["column"], p["format"]), + "to_numeric": lambda p: self.to_numeric(p["column"], p.get("errors", "coerce")), + "to_datetime": lambda p: self.to_datetime(p["column"], p.get("format"), p.get("errors", "coerce")), + "fill_na": lambda p: self.fill_na(p["column"], p["value"]), + "replace": lambda p: self.replace(p["column"], p["old_value"], p["new_value"]), + "extract": lambda p: self.extract(p["column"], p["pattern"]), + "truncate": lambda p: self.truncate(p["column"], p["length"], p.get("add_ellipsis", True)), + "pad": lambda p: self.pad(p["column"], p["width"], p.get("side", "left"), p.get("pad_char", " ")), + "clip": lambda p: self.clip(p["column"], p.get("lower"), p.get("upper")), + "bin": lambda p: self.bin(p["column"], p["bins"], p.get("labels")), + "normalize": lambda p: self.normalize(p["column"]), + "standardize": lambda p: self.standardize(p["column"]), + "map_values": lambda p: self.map_values(p["column"], p["mapping"]), + "encode_categorical": lambda p: self.encode_categorical(p["column"], p.get("drop_first", True)), + "validate_email": lambda p: self.validate_email(p["column"], p.get("drop_invalid", False)), + "validate_date_range": lambda p: self.validate_date_range( + p["column"], p["start_date"], p["end_date"], p.get("drop_invalid", False) + ), + "normalize_phone": lambda p: self.normalize_phone(p["column"], p.get("country_code", "+1")), + "remove_duplicates": lambda p: 
self.remove_duplicates(p["columns"], p.get("keep", "first")), + "validate_foreign_key": lambda p: self.validate_foreign_key( + p["column"], p["ref_df"], p["ref_column"], p.get("drop_invalid", False) + ), + "ensure_positive": lambda p: self.ensure_positive(p["column"], p.get("drop_negative", False)), + "standardize_categories": lambda p: self.standardize_categories(p["column"], p["mapping"]), + } + for transformation in transformations: - transformation_type = transformation.type - params = transformation.params + # Handle both dict and object transformations + if isinstance(transformation, dict): + transformation_type = transformation["type"] + params = transformation["params"] + else: + transformation_type = transformation.type + params = transformation.params - if transformation_type == "anonymize": - self.anonymize(params["column"]) - elif transformation_type == "convert_timezone": - self.convert_timezone(params["column"], params["to"]) + handler = transformation_handlers.get(transformation_type) + if handler: + handler(params) else: - raise UnsupportedTransformation( - f"Transformation type '{transformation_type}' is not supported" - ) + raise UnsupportedTransformation(f"Transformation type '{transformation_type}' is not supported") return self.df diff --git a/tests/unit_tests/data_loader/test_transformation_manager.py b/tests/unit_tests/data_loader/test_transformation_manager.py index f0b2530c3..146ade227 100644 --- a/tests/unit_tests/data_loader/test_transformation_manager.py +++ b/tests/unit_tests/data_loader/test_transformation_manager.py @@ -1,122 +1,515 @@ -from datetime import datetime - import pandas as pd import pytest - -from pandasai.data_loader.transformation_manager import TransformationManager +from datetime import datetime +import numpy as np from pandasai.exceptions import UnsupportedTransformation +from pandasai.data_loader.transformation_manager import TransformationManager class TestTransformationManager: def test_anonymize_email(self): """Test that email anonymization preserves domain but hides username.""" - df = pd.DataFrame( - { - "email": [ - "user1@example.com", - "user2@example.com", - "test.user@domain.org", - ] - } - ) - + df = pd.DataFrame({ + "email": ["user1@example.com", "user2@example.com", "test.user@domain.org"] + }) + manager = TransformationManager(df) result = manager.anonymize("email").df - + assert all(result["email"].str.contains("@")) assert "@example.com" in result.iloc[0]["email"] assert "@example.com" in result.iloc[1]["email"] assert "@domain.org" in result.iloc[2]["email"] assert not any(result["email"].isin(["user1@example.com", "user2@example.com"])) - + def test_anonymize_non_email(self): """Test that non-email values are completely anonymized.""" - df = pd.DataFrame({"name": ["John Doe", "Jane Smith"]}) - + df = pd.DataFrame({ + "name": ["John Doe", "Jane Smith"] + }) + manager = TransformationManager(df) result = manager.anonymize("name").df - + assert result.iloc[0]["name"] == "*" * len("John Doe") assert result.iloc[1]["name"] == "*" * len("Jane Smith") - + def test_anonymize_handles_na(self): """Test that NA values are preserved during anonymization.""" - df = pd.DataFrame({"email": ["user@example.com", None, pd.NA]}) - + df = pd.DataFrame({ + "email": ["user@example.com", None, pd.NA] + }) + manager = TransformationManager(df) result = manager.anonymize("email").df - + assert pd.isna(result.iloc[1]["email"]) assert pd.isna(result.iloc[2]["email"]) - + def test_convert_timezone(self): """Test timezone conversion.""" - df = 
pd.DataFrame( - { - "timestamp": pd.to_datetime( - ["2023-01-01T12:00:00+00:00", "2023-01-02T15:30:00+00:00"] - ) - } - ) - + df = pd.DataFrame({ + "timestamp": pd.to_datetime([ + "2023-01-01T12:00:00+00:00", + "2023-01-02T15:30:00+00:00" + ]) + }) + manager = TransformationManager(df) result = manager.convert_timezone("timestamp", "US/Pacific").df - + assert all(ts.tzinfo is not None for ts in result["timestamp"]) - assert all( - ts.tzname() == "PST" or ts.tzname() == "PDT" for ts in result["timestamp"] - ) - - def test_apply_multiple_transformations(self): - """Test applying multiple transformations in sequence.""" - df = pd.DataFrame( - { - "email": ["user1@example.com", "user2@example.com"], - "timestamp": pd.to_datetime( - ["2023-01-01T12:00:00+00:00", "2023-01-02T15:30:00+00:00"] - ), - } - ) + assert all(ts.tzname() == "PST" for ts in result["timestamp"]) + + def test_to_lowercase(self): + """Test converting strings to lowercase.""" + df = pd.DataFrame({ + "text": ["Hello WORLD", "PyThOn", "TEST"] + }) + + manager = TransformationManager(df) + result = manager.to_lowercase("text").df + + assert all(result["text"] == ["hello world", "python", "test"]) + + def test_to_uppercase(self): + """Test converting strings to uppercase.""" + df = pd.DataFrame({ + "text": ["Hello World", "Python", "test"] + }) + + manager = TransformationManager(df) + result = manager.to_uppercase("text").df + + assert all(result["text"] == ["HELLO WORLD", "PYTHON", "TEST"]) + + def test_strip(self): + """Test stripping whitespace.""" + df = pd.DataFrame({ + "text": [" hello ", " world ", "python\n"] + }) + + manager = TransformationManager(df) + result = manager.strip("text").df + + assert all(result["text"] == ["hello", "world", "python"]) + + def test_round_numbers(self): + """Test rounding numbers.""" + df = pd.DataFrame({ + "numbers": [1.23456, 2.78901, 3.5] + }) + + manager = TransformationManager(df) + result = manager.round_numbers("numbers", 2).df + + assert all(result["numbers"] == [1.23, 2.79, 3.50]) + + def test_scale(self): + """Test scaling numbers.""" + df = pd.DataFrame({ + "numbers": [1, 2, 3] + }) + + manager = TransformationManager(df) + result = manager.scale("numbers", 2.5).df + + assert all(result["numbers"] == [2.5, 5.0, 7.5]) + + def test_format_date(self): + """Test date formatting.""" + df = pd.DataFrame({ + "date": pd.to_datetime([ + "2023-01-01", + "2023-12-31" + ]) + }) + + manager = TransformationManager(df) + result = manager.format_date("date", "%d/%m/%Y").df + + assert all(result["date"] == ["01/01/2023", "31/12/2023"]) + + def test_to_numeric(self): + """Test converting to numeric.""" + df = pd.DataFrame({ + "mixed": ["1", "2.5", "invalid", "3"] + }) + + manager = TransformationManager(df) + result = manager.to_numeric("mixed").df + + assert result["mixed"].dtype == np.float64 + assert pd.isna(result.iloc[2]["mixed"]) + assert result.iloc[3]["mixed"] == 3.0 + + def test_to_datetime(self): + """Test converting to datetime.""" + df = pd.DataFrame({ + "dates": ["2023-01-01", "2023-12-31", "invalid"] + }) + + manager = TransformationManager(df) + result = manager.to_datetime("dates").df + + assert pd.api.types.is_datetime64_any_dtype(result["dates"]) + assert pd.isna(result.iloc[2]["dates"]) + + def test_fill_na(self): + """Test filling NA values.""" + df = pd.DataFrame({ + "values": [1, None, pd.NA, 4] + }) + + manager = TransformationManager(df) + result = manager.fill_na("values", 0).df + + assert all(result["values"] == [1, 0, 0, 4]) + def test_method_chaining(self): + """Test 
that methods can be chained.""" + df = pd.DataFrame({ + "text": [" Hello ", " World "], + "numbers": [1.23456, 2.78901] + }) + + manager = TransformationManager(df) + result = (manager + .strip("text") + .to_uppercase("text") + .round_numbers("numbers", 2) + .df) + + assert all(result["text"] == ["HELLO", "WORLD"]) + assert all(result["numbers"] == [1.23, 2.79]) + + def test_apply_transformations(self): + """Test applying multiple transformations through configuration.""" + df = pd.DataFrame({ + "email": ["user1@example.com", "user2@example.com"], + "timestamp": pd.to_datetime([ + "2023-01-01T12:00:00+00:00", + "2023-01-02T15:30:00+00:00" + ]), + "text": [" Hello ", " World "], + "numbers": [1.23456, 2.78901] + }) + transformations = [ - type( - "Transformation", - (), - {"type": "anonymize", "params": {"column": "email"}}, - )(), - type( - "Transformation", - (), - { - "type": "convert_timezone", - "params": {"column": "timestamp", "to": "UTC"}, - }, - )(), + type("Transformation", (), { + "type": "anonymize", + "params": {"column": "email"} + })(), + type("Transformation", (), { + "type": "convert_timezone", + "params": {"column": "timestamp", "to": "UTC"} + })(), + type("Transformation", (), { + "type": "strip", + "params": {"column": "text"} + })(), + type("Transformation", (), { + "type": "round_numbers", + "params": {"column": "numbers", "decimals": 2} + })() ] - + manager = TransformationManager(df) result = manager.apply_transformations(transformations) - + # Check anonymization assert all(result["email"].str.contains("@example.com")) assert not any(result["email"].isin(["user1@example.com", "user2@example.com"])) - + # Check timezone conversion assert all(ts.tzinfo is not None for ts in result["timestamp"]) assert all(ts.tzname() == "UTC" for ts in result["timestamp"]) - + + # Check strip + assert all(result["text"] == ["Hello", "World"]) + + # Check rounding + assert all(result["numbers"] == [1.23, 2.79]) + def test_unsupported_transformation(self): """Test that unsupported transformation type raises exception.""" df = pd.DataFrame({"col": [1, 2, 3]}) - + transformations = [ - type( - "Transformation", - (), - {"type": "unsupported_type", "params": {"column": "col"}}, - )() + type("Transformation", (), { + "type": "unsupported_type", + "params": {"column": "col"} + })() ] - + manager = TransformationManager(df) with pytest.raises(UnsupportedTransformation): manager.apply_transformations(transformations) + + def test_replace(self): + """Test replacing text.""" + df = pd.DataFrame({ + "text": ["hello world", "world hello", "hello hello"] + }) + + manager = TransformationManager(df) + result = manager.replace("text", "hello", "hi").df + + assert all(result["text"] == ["hi world", "world hi", "hi hi"]) + + def test_extract(self): + """Test extracting text with regex.""" + df = pd.DataFrame({ + "text": ["user123@example.com", "test456@domain.org"] + }) + + manager = TransformationManager(df) + result = manager.extract("text", r"(\d+)").df + + assert all(result["text"] == ["123", "456"]) + + def test_truncate(self): + """Test string truncation.""" + df = pd.DataFrame({ + "text": ["very long text", "short", "another long text"] + }) + + manager = TransformationManager(df) + result = manager.truncate("text", 8, add_ellipsis=True).df + + assert result["text"].tolist() == ["very...", "short", "anoth..."] + + def test_pad(self): + """Test string padding.""" + df = pd.DataFrame({ + "text": ["123", "4567", "89"] + }) + + manager = TransformationManager(df) + result = manager.pad("text", 
width=5, side="left", pad_char="0").df + + assert all(result["text"] == ["00123", "04567", "00089"]) + + def test_clip(self): + """Test clipping numeric values.""" + df = pd.DataFrame({ + "numbers": [1, 5, 10, 15, 20] + }) + + manager = TransformationManager(df) + result = manager.clip("numbers", lower=5, upper=15).df + + assert all(result["numbers"] == [5, 5, 10, 15, 15]) + + def test_bin(self): + """Test binning continuous data.""" + df = pd.DataFrame({ + "numbers": [1, 5, 10, 15, 20] + }) + + manager = TransformationManager(df) + result = manager.bin( + "numbers", + bins=[0, 10, 20], + labels=["low", "high"] + ).df + + assert result["numbers"].tolist() == ["low", "low", "low", "high", "high"] + + def test_normalize(self): + """Test min-max normalization.""" + df = pd.DataFrame({ + "numbers": [1, 3, 5] + }) + + manager = TransformationManager(df) + result = manager.normalize("numbers").df + + assert result["numbers"].tolist() == [0.0, 0.5, 1.0] + + def test_standardize(self): + """Test z-score standardization.""" + df = pd.DataFrame({ + "numbers": [1, 2, 3] + }) + + manager = TransformationManager(df) + result = manager.standardize("numbers").df + + # Mean should be 0 and std should be 1 + assert abs(result["numbers"].mean()) < 1e-10 + assert abs(result["numbers"].std() - 1) < 1e-10 + + def test_map_values(self): + """Test mapping values.""" + df = pd.DataFrame({ + "categories": ["A", "B", "C", "A"] + }) + + mapping = {"A": "Category A", "B": "Category B", "C": "Category C"} + manager = TransformationManager(df) + result = manager.map_values("categories", mapping).df + + assert all(result["categories"] == [ + "Category A", "Category B", "Category C", "Category A" + ]) + + def test_encode_categorical(self): + """Test one-hot encoding.""" + df = pd.DataFrame({ + "categories": ["A", "B", "A", "C"] + }) + + manager = TransformationManager(df) + result = manager.encode_categorical("categories", drop_first=True).df + + # Should have n-1 columns for n categories when drop_first=True + assert "categories_B" in result.columns + assert "categories_C" in result.columns + assert "categories_A" not in result.columns + assert result.shape[1] == 2 + + def test_complex_transformation_chain(self): + """Test a complex chain of transformations.""" + df = pd.DataFrame({ + "text": [" Hello World ", "Python ", " Data Science "], + "numbers": [1.23456, -5.6789, 10.0], + "categories": ["A", "B", "A"] + }) + + manager = TransformationManager(df) + result = (manager + .strip("text") + .to_uppercase("text") + .truncate("text", 8) + .clip("numbers", lower=0, upper=None) + .round_numbers("numbers", 2) + .encode_categorical("categories") + .df) + + assert result["text"].tolist() == ["HELLO...", "PYTHON", "DATA..."] + assert result["numbers"].tolist() == [1.23, 0.00, 10.00] + assert "categories_B" in result.columns + + def test_validate_email(self): + """Test email validation.""" + df = pd.DataFrame({ + "email": ["user@example.com", "invalid.email", "another@domain.com", None] + }) + + manager = TransformationManager(df) + result = manager.validate_email("email").df + + assert result["email_valid"].tolist() == [True, False, True, False] + + result = manager.validate_email("email", drop_invalid=True).df + assert len(result) == 2 + assert all(result["email"].isin(["user@example.com", "another@domain.com"])) + + def test_validate_date_range(self): + """Test date range validation.""" + df = pd.DataFrame({ + "date": ["2023-01-01", "2024-06-15", "2025-12-31", "invalid", None] + }) + + manager = TransformationManager(df) + 
result = manager.validate_date_range( + "date", "2023-01-01", "2024-12-31" + ).df + + assert result["date_valid"].tolist() == [True, True, False, False, False] + + result = manager.validate_date_range( + "date", "2023-01-01", "2024-12-31", drop_invalid=True + ).df + assert len(result) == 2 + assert all(result["date"].isin(["2023-01-01", "2024-06-15"])) + + def test_normalize_phone(self): + """Test phone number normalization.""" + df = pd.DataFrame({ + "phone": ["1234567890", "(123) 456-7890", "+44 20 7123 4567", None] + }) + + manager = TransformationManager(df) + result = manager.normalize_phone("phone").df + + expected = [ + "+1-123-456-7890", + "+1-123-456-7890", + "+44-207-123-4567", + None + ] + assert result["phone"].tolist() == expected + + def test_remove_duplicates(self): + """Test duplicate removal.""" + df = pd.DataFrame({ + "id": [1, 2, 1, 3], + "value": ["a", "b", "a", "c"] + }) + + manager = TransformationManager(df) + result = manager.remove_duplicates("id").df + + assert len(result) == 3 + assert result["id"].tolist() == [1, 2, 3] + + def test_validate_foreign_key(self): + """Test foreign key validation.""" + users_df = pd.DataFrame({ + "user_id": [1, 2, 3] + }) + + orders_df = pd.DataFrame({ + "order_id": [1, 2, 3, 4], + "user_id": [1, 2, 4, None] + }) + + manager = TransformationManager(orders_df) + result = manager.validate_foreign_key( + "user_id", users_df, "user_id" + ).df + + assert result["user_id_valid"].tolist() == [True, True, False, False] + + result = manager.validate_foreign_key( + "user_id", users_df, "user_id", drop_invalid=True + ).df + assert len(result) == 2 + assert all(result["user_id"].isin([1, 2])) + + def test_ensure_positive(self): + """Test ensuring positive values.""" + df = pd.DataFrame({ + "amount": [100, -50, 0, 75, -25] + }) + + manager = TransformationManager(df) + result = manager.ensure_positive("amount").df + assert result["amount"].tolist() == [100, 0, 0, 75, 0] + + # Create a new manager to start fresh + manager = TransformationManager(df) + result = manager.ensure_positive("amount", drop_negative=True).df + assert len(result) == 3 + assert result["amount"].tolist() == [100, 0, 75] + + def test_standardize_categories(self): + """Test category standardization.""" + df = pd.DataFrame({ + "company": ["Apple Inc.", "Apple Computer", "Microsoft Corp", "MS"] + }) + + mapping = { + "Apple Inc.": "Apple", + "Apple Computer": "Apple", + "Microsoft Corp": "Microsoft", + "MS": "Microsoft" + } + + manager = TransformationManager(df) + result = manager.standardize_categories("company", mapping).df + + expected = ["Apple", "Apple", "Microsoft", "Microsoft"] + assert result["company"].tolist() == expected
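+
+    # Sketch of an additional test (illustrative data): exercises the dict-based
+    # configuration path that apply_transformations accepts alongside objects.
+    def test_apply_transformations_with_dicts(self):
+        """Test that plain dicts, as produced by schema files, are accepted."""
+        df = pd.DataFrame({
+            "text": ["  hello  ", "  world  "],
+            "numbers": [1.239, 2.786]
+        })
+
+        transformations = [
+            {"type": "strip", "params": {"column": "text"}},
+            {"type": "round_numbers", "params": {"column": "numbers", "decimals": 2}}
+        ]
+
+        manager = TransformationManager(df)
+        result = manager.apply_transformations(transformations)
+
+        assert result["text"].tolist() == ["hello", "world"]
+        assert result["numbers"].tolist() == [1.24, 2.79]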