Source code for src.clean

"""
Clean the dataset before modeling.
"""
import logging
from time import time

import pandas as pd

logger = logging.getLogger(__name__)

# Some albums lack a record label in the dataset, even though
# they have one in reality. Correct these ones manually.
#
# NOTE: The order of the entries is significant. This tuple is the default
# `fill_with` argument of `fill_missing_manually`, which pairs the values
# positionally with the rows whose label is missing (first entry -> first
# missing row, and so on). Repeated labels such as "Self-released" are
# therefore separate entries, not duplicates to be removed.
# The trailing comment on each entry presumably names the album that the
# label belongs to -- verify against the dataset before reordering.
FILL_MISSING_RECORDLABEL_DATA = (
    "Fool's Gold",          # Run the Jewels
    "Vapor",                # 808s and Dark Grapes III
    "101 Distribution",     # Dedication 2
    "Jet Life",             # The Drive In Theatre
    "Espo",                 # Animals
    "Cinematic",            # 1999
    "Def Jam",              # Rich Forever
    "LM Dupli-Cation",      # Cervantine
    "Glory Boyz",           # Back From the Dead
    "Epic",                 # Drilluminati
    "Self-released",        # Community Service 2!
    "Cash Money",           # Sorry 4 the Wait
    "Grand Hustle",         # Fuck a Mixtape
    "Vice",                 # Blue Chips
    "Free Bandz",           # 56 Nights
    "Six Shooter Records",  # Retribution
    "Self-released",        # Acid Rap
    "Maybach",              # Dreamchasers
    "Self-released",        # White Mystery
    "Top Dawg",             # Cilvia Demo
    "Triple X",             # Winter Hill
    "1017",                 # 1017 Thug
    "Rostrum",              # Kush and Orange Juice
    "BasedWorld",           # God's Father
    "10.Deep",              # The Mixtape About Nothing
    "Self-released"         # Coloring Book
)


def clean_dataset(data: pd.DataFrame, config) -> pd.DataFrame:
    """
    Run the cleaning steps enabled in the config, in a fixed order.

    A step runs only when its name is present as a key in `config`; the value
    stored under that key is unpacked as keyword arguments for the step.
    `fill_na_with_str` and `bucket_values_together` are each applied twice,
    once with the "iteration1" and once with the "iteration2" sub-section of
    their config entry.

    Args:
        data (:obj:`pandas.DataFrame`): Raw data
        config (dict): Config file as read in by PyYAML

    Returns:
        :obj:`pandas.DataFrame` of cleaned data
    """
    began_at = time()
    passes = ("iteration1", "iteration2")

    # Missing-value filling comes first and is applied in two passes
    if "fill_na_with_str" in config:
        fill_settings = config["fill_na_with_str"]
        for pass_name in passes:
            data = fill_na_with_str(data, **fill_settings[pass_name])

    # Date handling: parse, borrow the year where needed, then drop the time
    if "convert_str_to_datetime" in config:
        parse_kwargs = config["convert_str_to_datetime"]
        data = convert_str_to_datetime(data, **parse_kwargs)

    if "approximate_missing_year" in config:
        year_kwargs = config["approximate_missing_year"]
        data = approximate_missing_year(data, **year_kwargs)

    if "convert_datetime_to_date" in config:
        date_kwargs = config["convert_datetime_to_date"]
        data = convert_datetime_to_date(data, **date_kwargs)

    # Manual corrections and tidying of text values
    if "fill_missing_manually" in config:
        manual_kwargs = config["fill_missing_manually"]
        data = fill_missing_manually(data, **manual_kwargs)

    if "strip_whitespace" in config:
        strip_kwargs = config["strip_whitespace"]
        data = strip_whitespace(data, **strip_kwargs)

    # Bucketing runs last and, like the NA filling, in two passes
    if "bucket_values_together" in config:
        bucket_settings = config["bucket_values_together"]
        for pass_name in passes:
            data = bucket_values_together(data, **bucket_settings[pass_name])

    logger.info("Completed data cleaning process. Time taken: %0.4fs", time() - began_at)
    return data
def convert_str_to_datetime(
        data: pd.DataFrame,
        colname: str = "reviewdate",
        datetime_format: str = "%B %d %Y"
) -> pd.DataFrame:
    """
    Turn a column of date strings into datetime values.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. If the column does not exist, a warning is
    logged and the data is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str, optional): Name of column to apply transformation to.
            Defaults to "reviewdate".
        datetime_format (str, optional): Datetime format of column.
            Defaults to "%B %d %Y". For more info on these codes:
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes.

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    if colname in data.columns:
        # Parse using the explicit format and write the result back
        parsed = pd.to_datetime(data[colname], format=datetime_format)
        data[colname] = parsed
        logger.debug("Converted column %s to datetime format", colname)
    else:
        # Nothing to convert when the column is absent
        logger.warning("%s not found in columns. Returning original data.", colname)

    return data
def convert_datetime_to_date(data: pd.DataFrame, colname: str = "reviewdate") -> pd.DataFrame:
    """
    Reduce a datetime column to its date part.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. If the column does not exist, a warning is
    logged and the data is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str, optional): Name of column to apply transformation to.
            Defaults to "reviewdate".

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    if colname in data.columns:
        # Keep only the calendar date of every timestamp
        dates_only = data[colname].dt.date
        data[colname] = dates_only
        logger.debug("Converted column %s to date format", colname)
    else:
        # Nothing to convert when the column is absent
        logger.warning("%s not found in columns. Returning original data.", colname)

    return data
def approximate_missing_year(
        data: pd.DataFrame,
        fill_column: str = "releaseyear",
        approximate_with: str = "reviewdate"
) -> pd.DataFrame:
    """
    Replace missing entries of one column with the year of a datetime column.

    Only rows where `fill_column` is missing are changed. The DataFrame that
    was passed in is modified and returned. If either column does not exist,
    a warning is logged and the data is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        fill_column (str): Name of column to fill values in
        approximate_with (str): Name of `datetime` column to pull year from

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    # Both columns are required; `fill_column` is checked first
    for required in (fill_column, approximate_with):
        if required not in data.columns:
            logger.warning("%s not found in columns. Returning original data.", required)
            return data

    # Locate the gaps once and reuse the mask for counting and assignment
    is_missing = data[fill_column].isna()
    nrows_affected = int(is_missing.sum())

    # Take the year from the datetime column for exactly those rows
    data.loc[is_missing, fill_column] = data.loc[is_missing, approximate_with].dt.year

    logger.debug("Filled missing values in %s with year from %s", fill_column, approximate_with)
    logger.debug("Number of rows affected: %d", nrows_affected)
    return data
def fill_missing_manually(
        data: pd.DataFrame,
        colname: str = "recordlabel",
        fill_with: tuple = FILL_MISSING_RECORDLABEL_DATA
) -> pd.DataFrame:
    """
    Overwrite the missing entries of a column with hand-picked values.

    The values in `fill_with` are paired positionally with the rows whose
    entry in `colname` is missing: the first value goes to the first such
    row, and so on. The DataFrame that was passed in is modified and
    returned. If the column does not exist, a warning is logged and the data
    is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str, optional): Name of column to apply transformation to.
            Defaults to "recordlabel".
        fill_with (iterable): Corrected values to replace missing values with.
            Data type depends on the column being filled.

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    if colname not in data.columns:
        # Nothing to fill when the column is absent
        logger.warning("%s not found in columns. Returning original data.", colname)
        return data

    # Rows that currently have no value in the target column
    is_missing = pd.isna(data[colname])
    missing_rows = data[is_missing].index

    # Label each replacement with the row it belongs to, then assign
    corrections = pd.Series(data=fill_with, index=missing_rows)
    data.loc[is_missing, colname] = corrections

    logger.debug(
        "Manually filled in missing values for %d missing rows in column %s",
        len(corrections.index),
        colname
    )
    return data
def strip_whitespace(data: pd.DataFrame, colname: str = "recordlabel") -> pd.DataFrame:
    """
    Trim leading and trailing whitespace from the string values in a column.

    Only `str` values are stripped. Anything else in the column (missing
    values such as NaN/None, numbers, ...) is passed through unchanged
    instead of raising a `TypeError`. The column is overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned. If the
    column does not exist, a warning is logged and the data is returned
    untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str, optional): Name of column to apply transformation to.
            Defaults to "recordlabel".

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    # Do nothing if the specified column is not present
    if colname not in data.columns:
        logger.warning("%s not found in columns. Returning original data.", colname)
        return data

    # `str.strip` as an unbound method rejects non-str objects, so guard on
    # the type and leave every other value exactly as it is
    data[colname] = data[colname].apply(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    logger.debug("Trimmed extra whitespace in column %s", colname)
    return data
def bucket_values_together(
        data: pd.DataFrame,
        colname: str,
        values: list,
        replace_with: list
) -> pd.DataFrame:
    """
    Collapse several values of a column into one replacement value.

    Every entry of `colname` that equals one of `values` is overwritten with
    `replace_with`. The DataFrame that was passed in is modified and
    returned. If the column does not exist, a warning is logged and the data
    is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str): Name of column to apply transformation to
        values (iterable): Iterable of values to replace.
        replace_with: Value to replace with.

    Returns:
        Cleaned :obj:`pandas.DataFrame`

    Raises:
        `TypeError` if a single `str` object is passed to `values`. A `str`
        is itself iterable (character by character), so it would not fail on
        its own, but bucketing individual characters makes no sense here.
    """
    if colname not in data.columns:
        # Nothing to bucket when the column is absent
        logger.warning("%s not found in columns. Returning original data.", colname)
        return data

    # Reject a bare string explicitly (see Raises above)
    if isinstance(values, str):
        logger.error(
            'Error: Received a single string "%s" instead of an iterable '
            'of values to bucket together.',
            values
        )
        raise TypeError("`bucket_values_together` requires an iterable of values, not a str")

    # Handle the old values one at a time, tallying the rows that are hit
    nrows_affected = 0
    for old_value in values:
        hits = data[colname] == old_value
        nrows_affected += int(hits.sum())
        data.loc[hits, colname] = replace_with

    replaced_listing = ", ".join(map(str, values))
    logger.debug(
        "Replaced values (%s) with %s in column %s",
        replaced_listing,
        replace_with,
        colname
    )
    logger.debug("Number of rows affected: %d", nrows_affected)
    return data
def fill_na_with_str(
        data: pd.DataFrame,
        colname: str = "genre",
        fill_string: str = "Missing"
) -> pd.DataFrame:
    """
    Substitute a placeholder string for the missing values of a column.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. If the column does not exist, a warning is
    logged and the data is returned untouched.

    Args:
        data (:obj:`pandas.DataFrame`): DataFrame to clean
        colname (str, optional): Name of column to apply transformation to.
            Defaults to "genre".
        fill_string (str, optional): String to replace missing values with.
            Defaults to "Missing".

    Returns:
        Cleaned :obj:`pandas.DataFrame`
    """
    if colname not in data.columns:
        # Nothing to fill when the column is absent
        logger.warning("%s not found in columns. Returning original data.", colname)
        return data

    column = data[colname]

    # Count the gaps before they are filled so the number can be reported
    nrows_affected = int(column.isna().sum())
    data[colname] = column.fillna(fill_string)

    logger.debug("Replaced missing values in %s with %s", colname, fill_string)
    logger.debug("Number of rows affected: %d", nrows_affected)
    return data