# Source code for suntzu.getter

import pandas as pd
import numpy as np

from .errors import *


# [docs]
class Getter:
    # cleaning functions

    # [docs]
    def get_best_int(col_min: int, col_max: int) -> str:
        """
        Pick the narrowest signed integer dtype whose range covers both bounds.

        Args:
            col_min (int): Smallest value that must be representable.
            col_max (int): Largest value that must be representable.

        Returns:
            str: "int8", "int16" or "int32" when both bounds fit inside that
                signed width; "int64" is the fallback for anything wider.

        Examples:
            >>> from suntzu import Getter
            >>> Getter.get_best_int(-50, 100)
            'int8'
            >>> Getter.get_best_int(-200, 30000)
            'int16'
            >>> Getter.get_best_int(-50000, 100000)
            'int32'
            >>> Getter.get_best_int(-5000000000, 5000000000)
            'int64'
        """
        # Walk the signed widths from narrowest to widest; the first width
        # whose [-2**(bits-1), 2**(bits-1) - 1] range holds both bounds wins.
        for bits in (8, 16, 32):
            half_range = 1 << (bits - 1)
            if col_min >= -half_range and col_max <= half_range - 1:
                return f"int{bits}"

        return "int64"

    

    # [docs]
    def get_best_float(col_min: float, col_max: float) -> str:
        """
        Determines the most memory-efficient floating-point type capable of representing 
        a range of values.

        Only the representable *range* is considered: a narrower type also has
        fewer significant digits, so values may lose precision when downcast.

        Args:
            col_min (float): The minimum value in the range.
            col_max (float): The maximum value in the range.

        Returns:
            str: The name of the smallest floating-point type whose finite range
                contains both bounds. Possible returns are "float16", "float32",
                or "float64" (the fallback, also used when a bound is NaN because
                every comparison with NaN is false).

        Examples:
            >>> from suntzu import Getter
            >>> Getter.get_best_float(0.1, 100.0)
            'float16'
            >>> Getter.get_best_float(-1e5, 1e5)
            'float32'
            >>> Getter.get_best_float(-1e40, 1e40)
            'float64'
        """
        for name, float_type in (("float16", np.float16), ("float32", np.float32)):
            limits = np.finfo(float_type)
            # Compare against plain Python floats so the bounds are never
            # narrowed to the candidate type before the comparison happens.
            lowest, highest = float(limits.min), float(limits.max)
            # Both ends of the range must fit: the lower bound is checked against
            # the type's minimum and the upper bound against its maximum.
            if col_min >= lowest and col_max <= highest:
                return name

        return "float64"

        

    

    # [docs]
    def get_best_dtype(self, col: str) -> str:
        """
        Determines the most memory-efficient data type for a column based on its values.

        The method inspects the column's current data type and value range to infer
        a more optimal dtype:
        - Integers are downcast to the smallest possible integer type.
        - Floats are downcast to the smallest possible floating-point type.
        - Object columns with a low number of unique values are converted to category.
        - Other types are returned unchanged.

        Args:
        col (str): Name of the column to analyze.

        Returns:
        str: The name of the most suitable data type for the column.

        Examples:
            >>> from suntzu import Getter
            >>> Getter.get_best_dtype(df, "age")
            'int8'
            >>> Getter.get_best_dtype(df, "price")
            'float32'
            >>> Getter.get_best_dtype(df, "status")
            'category'
        """
        dtype = self[col].dtype.name # returns int || float || category || bool
        col_min = Getter.get_min_value(self, col)
        col_max = Getter.get_max_value(self, col)
        if "int" in dtype:
            dtype = Getter.get_best_int(col_min, col_max)
        elif "float" in dtype:
            dtype = Getter.get_best_float(col_min, col_max)
        elif dtype == "object":
            if self[col].nunique() <= 10:
                dtype = "category"
        return dtype

    
    # statistics functions

    # [docs]
    def get_max_value(self: pd.DataFrame, col: str) -> int | str:
        """
        Returns the maximum value of a DataFrame column, handling different data types appropriately.

        Args:
            col (str): The column of the DataFrame to inspect.

        Returns:
            int | str: 
                - For numeric columns, returns the maximum value.
                - For categorical or boolean columns, returns the most frequent value (mode).

        Raises:
            MixedDtypeError: If the column contains mixed types or null values.

        Examples:
            >>> from suntzu import Getter
            >>> import pandas as pd
            >>> df = pd.DataFrame({'a': [1, 3, 2], 'b': [True, False, True], 'c': ['x', 'y', 'x']})
            >>> Getter.get_max_value(df, 'a')
            3
            >>> Getter.get_max_value(df, 'b')
            True
            >>> Getter.get_max_value(df, 'c')
            'x'
        """

        dtype = self[col].dtype.name
        try:
            if not dtype in ["categorical", "bool", "object"]:
                value = self[col].max()
            else:
                value = self[col].mode()[0]
        except TypeError:
            raise MixedDtypeError(f"Column '{col}' contains mixed types (e.g., str + float) or null values. Please try cleaning it.")

        return value


    # [docs]
    def get_min_value(self: pd.DataFrame, col: str) -> int | str:
        """
        Returns the minimum value of a DataFrame column, handling different data types appropriately.

        Args:
            col (str): The column of the DataFrame to inspect.

        Returns:
            int | str: 
                - For numeric columns, returns the minimum value.
                - For categorical or boolean columns, returns the least frequent value.

        Raises:
            MixedDtypeError: If the column contains mixed types or null values.

        Examples:
            >>> from suntzu import Getter
            >>> import pandas as pd
            >>> df = pd.DataFrame({'a': [1, 3, 2], 'b': [True, False, True], 'c': ['x', 'y', 'x']})
            >>> Getter.get_min_value(df, 'a')
            1
            >>> Getter.get_min_value(df, 'b')
            False
            >>> Getter.get_min_value(df, 'c')
            'y'
        """

        dtype = self[col].dtype.name
        try:
            if not dtype in ["categorical", "bool", "object"]:
                value = self[col].min()
            else:
                value = self[col].value_counts().idxmin()
        except TypeError:
            raise MixedDtypeError(f"Column '{col}' contains mixed types (e.g., str + float) or null values. Please try cleaning it.")

        return value


    

    # [docs]
    def get_memory_usage(self, col, unit) -> float:
        """
        Report how much memory one column of the DataFrame occupies.

        The size comes from ``Series.memory_usage(deep=True)`` on the selected
        column and is divided by the byte count of the requested unit.

        Args:
            col (str): Name of the column to measure.
            unit (str): Unit of the result, one of:
                - "b" for bytes
                - "kb" for kilobytes (1024 bytes)
                - "mb" for megabytes (1024**2 bytes)

        Returns:
            float: Memory usage of the column in ``unit``, rounded to 2 decimal places.

        Raises:
            KeyError: If ``unit`` is not one of the supported keys.

        Examples:
            >>> df.get_memory_usage("age", "kb")
            12.5
            >>> df.get_memory_usage("price", "mb")
            0.01
        """
        # Resolve the divisor first so an unsupported unit fails before the
        # column is measured.
        bytes_per_unit = {"b": 1, "kb": 1024, "mb": 1024**2}[unit]
        size_in_bytes = self[col].memory_usage(deep=True)
        return round(size_in_bytes / bytes_per_unit, 2)


    # [docs]
    def get_total_memory_usage(self, unit) -> float:
        """
        Report how much memory the whole DataFrame occupies.

        The size is the sum of ``DataFrame.memory_usage(deep=True)`` divided by
        the byte count of the requested unit.

        Args:
            unit (str): Unit of the result, one of:
                - "b" for bytes
                - "kb" for kilobytes (1024 bytes)
                - "mb" for megabytes (1024**2 bytes)

        Returns:
            float: Total memory usage in ``unit``, rounded to 2 decimal places.

        Raises:
            KeyError: If ``unit`` is not one of the supported keys.

        Examples:
            >>> df.get_total_memory_usage("kb")
            125.5
            >>> df.get_total_memory_usage("mb")
            0.12
        """
        # Resolve the divisor first so an unsupported unit fails before the
        # frame is measured.
        bytes_per_unit = {"b": 1, "kb": 1024, "mb": 1024**2}[unit]
        size_in_bytes = self.memory_usage(deep=True).sum()
        return round(size_in_bytes / bytes_per_unit, 2)

    

    # [docs]
    def get_memory_insights(self, col:str, total_usage: int) -> list:

        nulls_count: int = self[col].isnull().sum()
        col_size: int = len(self[col]) 
        value_numeric = Getter.get_memory_usage(self, col, "kb")
        
        value_percentage = round((value_numeric/total_usage)*100, 2)
        
        try:
            col_info: list[str] = [  
                col,  
                self[col].dtype.name,  
                Getter.get_best_dtype(self, col),  
                f"{value_numeric} kb",  
                f"{value_percentage}%",  
                nulls_count,  
                f"{round(nulls_count/col_size, 2)}%",  
                self[col].nunique(),  
            ]
        # This error stops the whole function so we handle it to continue and give a warning
        except MixedDtypeError:
            print(f"WARNING: {col} has missing values, so the best dtype could not be found")
            col_info: list[str] = [  
                col,  
                self[col].dtype.name,  
                "???",  
                f"{value_numeric} kb",
                f"{value_percentage}%",
                nulls_count,  
                f"{round(nulls_count/col_size, 2)}%",  
                self[col].nunique(),  
            ] 
        finally:
            return col_info