import pandas as pd
import numpy as np
from .errors import *
[docs]
class Getter:
# cleaning functions
[docs]
def get_best_int(col_min: int, col_max: int) -> str:
    """
    Pick the narrowest signed integer dtype whose range covers both bounds.

    Args:
        col_min (int): Smallest value that must be representable.
        col_max (int): Largest value that must be representable.

    Returns:
        str: "int8", "int16" or "int32" for the first width whose signed
        range contains both ``col_min`` and ``col_max``; "int64" otherwise.

    Examples:
        >>> from suntzu import Getter
        >>> Getter.get_best_int(-50, 100)
        'int8'
        >>> Getter.get_best_int(-200, 30000)
        'int16'
        >>> Getter.get_best_int(-50000, 100000)
        'int32'
        >>> Getter.get_best_int(-5000000000, 5000000000)
        'int64'
    """
    # Candidate widths, narrowest first, with their inclusive signed bounds.
    signed_ranges = (
        ("int8", -128, 127),
        ("int16", -32768, 32767),
        ("int32", -2147483648, 2147483647),
    )
    for type_name, lowest, highest in signed_ranges:
        if col_min >= lowest and col_max <= highest:
            return type_name
    # Nothing narrower fits: fall back to the widest type without a range check.
    return "int64"
[docs]
def get_best_float(col_min: float, col_max: float) -> str:
    """
    Pick the narrowest floating-point dtype whose finite range covers both bounds.

    Only the representable *range* is checked (via ``np.finfo``); the
    precision lost when storing values in a narrower type is not considered.

    Args:
        col_min (float): Smallest value that must be representable.
        col_max (float): Largest value that must be representable.

    Returns:
        str: "float16" or "float32" for the first type whose range contains
        both ``col_min`` and ``col_max``; "float64" otherwise.

    Examples:
        >>> from suntzu import Getter
        >>> Getter.get_best_float(0.1, 100.0)
        'float16'
        >>> Getter.get_best_float(-1e5, 1e5)
        'float32'
        >>> Getter.get_best_float(-1e40, 1e40)
        'float64'
    """
    for type_name, np_type in (("float16", np.float16), ("float32", np.float32)):
        limits = np.finfo(np_type)
        # Compare as plain Python floats so out-of-range inputs are not
        # silently cast to the narrow type (and turned into inf) first.
        lowest, highest = float(limits.min), float(limits.max)
        # Both ends of the range must fit: the lower bound against the type's
        # minimum and the upper bound against its maximum.
        if col_min >= lowest and col_max <= highest:
            return type_name
    return "float64"
[docs]
def get_best_dtype(self, col: str) -> str:
    """
    Suggest a more memory-efficient dtype name for one column.

    The decision is driven by the column's current ``dtype.name``:
        - names containing "int" are narrowed with ``Getter.get_best_int``;
        - names containing "float" are narrowed with ``Getter.get_best_float``;
        - "object" columns with at most 10 unique values become "category";
        - every other dtype name is returned unchanged.

    The column's minimum and maximum are only computed for the numeric
    branches, so non-numeric columns never pay for (or fail on) them.

    Args:
        col (str): Name of the column to analyze.

    Returns:
        str: The suggested dtype name for the column.

    Raises:
        MixedDtypeError: Propagated from ``Getter.get_min_value`` /
            ``Getter.get_max_value`` when a numeric column's range cannot
            be computed.

    Examples:
        >>> from suntzu import Getter
        >>> Getter.get_best_dtype(df, "age")
        'int8'
        >>> Getter.get_best_dtype(df, "price")
        'float32'
        >>> Getter.get_best_dtype(df, "status")
        'category'
    """
    dtype = self[col].dtype.name
    is_int = "int" in dtype
    if is_int or "float" in dtype:
        # The value range is only needed (and only meaningful) when downcasting.
        col_min = Getter.get_min_value(self, col)
        col_max = Getter.get_max_value(self, col)
        if is_int:
            return Getter.get_best_int(col_min, col_max)
        return Getter.get_best_float(col_min, col_max)
    # Low-cardinality object columns are cheaper to store as categoricals.
    if dtype == "object" and self[col].nunique() <= 10:
        return "category"
    return dtype
# statistics functions
[docs]
def get_max_value(self: pd.DataFrame, col: str) -> int | str:
"""
Returns the maximum value of a DataFrame column, handling different data types appropriately.
Args:
col (str): The column of the DataFrame to inspect.
Returns:
int | str:
- For numeric columns, returns the maximum value.
- For categorical or boolean columns, returns the most frequent value (mode).
Raises:
MixedDtypeError: If the column contains mixed types or null values.
Examples:
>>> from suntzu import Getter
>>> import pandas as pd
>>> df = pd.DataFrame({'a': [1, 3, 2], 'b': [True, False, True], 'c': ['x', 'y', 'x']})
>>> Getter.get_max_value(df, 'a')
3
>>> Getter.get_max_value(df, 'b')
True
>>> Getter.get_max_value(df, 'c')
'x'
"""
dtype = self[col].dtype.name
try:
if not dtype in ["categorical", "bool", "object"]:
value = self[col].max()
else:
value = self[col].mode()[0]
except TypeError:
raise MixedDtypeError(f"Column '{col}' contains mixed types (e.g., str + float) or null values. Please try cleaning it.")
return value
[docs]
def get_min_value(self: pd.DataFrame, col: str) -> int | str:
"""
Returns the minimum value of a DataFrame column, handling different data types appropriately.
Args:
col (str): The column of the DataFrame to inspect.
Returns:
int | str:
- For numeric columns, returns the minimum value.
- For categorical or boolean columns, returns the least frequent value.
Raises:
MixedDtypeError: If the column contains mixed types or null values.
Examples:
>>> from suntzu import Getter
>>> import pandas as pd
>>> df = pd.DataFrame({'a': [1, 3, 2], 'b': [True, False, True], 'c': ['x', 'y', 'x']})
>>> Getter.get_min_value(df, 'a')
1
>>> Getter.get_min_value(df, 'b')
False
>>> Getter.get_min_value(df, 'c')
'y'
"""
dtype = self[col].dtype.name
try:
if not dtype in ["categorical", "bool", "object"]:
value = self[col].min()
else:
value = self[col].value_counts().idxmin()
except TypeError:
raise MixedDtypeError(f"Column '{col}' contains mixed types (e.g., str + float) or null values. Please try cleaning it.")
return value
[docs]
def get_memory_usage(self, col, unit) -> float:
    """
    Report how much memory a single column occupies, in the requested unit.

    The size is taken from ``Series.memory_usage(deep=True)`` and divided
    by the number of bytes in one ``unit``.

    Args:
        col (str): Name of the column to measure.
        unit (str): Unit of the result. One of:
            - "b" for bytes
            - "kb" for kilobytes (1024 bytes)
            - "mb" for megabytes (1024**2 bytes)

    Returns:
        float: Memory usage of the column, rounded to 2 decimal places.

    Raises:
        KeyError: If ``unit`` is not one of the supported keys.

    Examples:
        >>> df.get_memory_usage("age", "kb")
        12.5
        >>> df.get_memory_usage("price", "mb")
        0.01
    """
    # Resolve the unit first so an unsupported unit fails before any measuring.
    bytes_per_unit = {"b": 1, "kb": 1024, "mb": 1024 ** 2}[unit]
    size_in_bytes = self[col].memory_usage(deep=True)
    return round(size_in_bytes / bytes_per_unit, 2)
[docs]
def get_total_memory_usage(self, unit) -> float:
    """
    Report how much memory the whole DataFrame occupies, in the requested unit.

    The size is the sum of ``DataFrame.memory_usage(deep=True)`` divided by
    the number of bytes in one ``unit``.

    Args:
        unit (str): Unit of the result. One of:
            - "b" for bytes
            - "kb" for kilobytes (1024 bytes)
            - "mb" for megabytes (1024**2 bytes)

    Returns:
        float: Total memory usage of the DataFrame, rounded to 2 decimal places.

    Raises:
        KeyError: If ``unit`` is not one of the supported keys.

    Examples:
        >>> df.get_total_memory_usage("kb")
        125.5
        >>> df.get_total_memory_usage("mb")
        0.12
    """
    # Resolve the unit first so an unsupported unit fails before any measuring.
    bytes_per_unit = {"b": 1, "kb": 1024, "mb": 1024 ** 2}[unit]
    per_part_bytes = self.memory_usage(deep=True)
    return round(per_part_bytes.sum() / bytes_per_unit, 2)
[docs]
def get_memory_insights(self, col:str, total_usage: int) -> list:
    """
    Build one summary row describing a column's memory footprint and nulls.

    Args:
        col (str): Name of the column to summarize.
        total_usage (int): Memory usage of the whole DataFrame, used as the
            denominator for the column's share. Presumably expressed in kb,
            since the column itself is measured in kb -- verify against caller.

    Returns:
        list: Eight entries, in order:
            1. the column name,
            2. the current dtype name,
            3. the dtype suggested by ``Getter.get_best_dtype`` ("???" when
               it raised ``MixedDtypeError``),
            4. the column's memory usage formatted as "<value> kb",
            5. the column's share of ``total_usage`` formatted as "<value>%",
            6. the number of null values,
            7. the percentage of null values formatted as "<value>%",
            8. the number of unique values.
    """
    nulls_count: int = self[col].isnull().sum()
    col_size: int = len(self[col])
    value_numeric = Getter.get_memory_usage(self, col, "kb")
    # Guard both ratios so an empty frame/column reports 0 instead of failing.
    value_percentage = round((value_numeric / total_usage) * 100, 2) if total_usage else 0.0
    # Scale the null ratio to a real percentage, matching the "%" suffix and
    # the way the memory share above is computed.
    nulls_percentage = round((nulls_count / col_size) * 100, 2) if col_size else 0.0
    try:
        best_dtype = Getter.get_best_dtype(self, col)
    # This error would stop the whole report, so warn and keep going instead.
    except MixedDtypeError:
        print(f"WARNING: {col} has missing values, so the best dtype could not be found")
        best_dtype = "???"
    # Returned outside any ``finally`` so unexpected errors are not swallowed.
    return [
        col,
        self[col].dtype.name,
        best_dtype,
        f"{value_numeric} kb",
        f"{value_percentage}%",
        nulls_count,
        f"{nulls_percentage}%",
        self[col].nunique(),
    ]