
Predicting Labels

Module for training and applying a text classification model.

This class streamlines the process of fine-tuning a transformer-based classifier on labeled data and applying the trained model to annotate new, unlabeled datasets. It supports both single- and multi-column prediction and includes optional model saving and evaluation output.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `model_name` | `str` | Name of the pretrained Hugging Face model to fine-tune (default: `"distilbert-base-uncased"`). |
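The constructor accepts any Hugging Face checkpoint name suitable for sequence classification; a minimal sketch (the import path and the non-default model name below are assumptions, not taken from this page):

```python
from educhateval import PredictLabels  # import path assumed; adjust to your install

clf = PredictLabels()                                      # uses "distilbert-base-uncased"
clf_bert = PredictLabels(model_name="bert-base-uncased")   # any compatible HF checkpoint
```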

Methods:

| Name | Description |
| --- | --- |
| `run_pipeline` | Trains the classifier and returns a DataFrame with predicted labels and confidence scores. |
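A minimal end-to-end sketch of the intended workflow; the import path and the toy data are illustrative assumptions, not taken from this page:

```python
import pandas as pd
from educhateval import PredictLabels  # import path assumed

# Tiny illustrative training set using the default column names;
# a real run needs a substantially larger labeled dataset.
train_df = pd.DataFrame({
    "text": [
        "What is a derivative?",
        "Great job on the exercise!",
        "Please submit by Friday.",
    ],
    "category": ["question", "feedback", "instruction"],
})

# Unlabeled data to annotate.
new_df = pd.DataFrame({"text": ["When is the deadline?", "Well done!"]})

classifier = PredictLabels()  # defaults to "distilbert-base-uncased"
df_annotated = classifier.run_pipeline(train_data=train_df, new_data=new_df)
print(df_annotated)
```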

Source code in src/educhateval/core.py
```python
class PredictLabels:
    """
    Module for training and applying a text classification model.

    This class streamlines the process of fine-tuning a transformer-based classifier on labeled data
    and applying the trained model to annotate new, unlabeled datasets. Supports both single- and
    multi-column prediction and includes optional model saving and evaluation output.

    Attributes:
        model_name (str): Name of the pretrained Hugging Face model to fine-tune (default: "distilbert-base-uncased").

    Methods:
        run_pipeline(...): Trains the classifier and returns a DataFrame with predicted labels and confidence scores.
    """

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        self.model_name = model_name
        self.tokenizer = load_tokenizer(model_name)

    def run_pipeline(
        self,
        train_data: Union[str, pd.DataFrame],
        new_data: Union[str, pd.DataFrame],
        # columns in the training data
        text_column: str = "text",
        label_column: str = "category",
        # columns to classify in the new data
        columns_to_classify: Optional[Union[str, List[str]]] = None,
        split_ratio: float = 0.2,
        training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
        tuning: bool = False,
        tuning_params: Optional[dict] = None,
        model_save_path: Optional[str] = None,
        prediction_save_path: Optional[str] = None,
        seed: int = 42,
    ) -> pd.DataFrame:
        """
        This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based
        classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter
        tuning, and saving of both the trained model and prediction outputs.

        Parameters:
            train_data (Union[str, pd.DataFrame]): Labeled dataset for training. Can be a DataFrame or a CSV file path.
            new_data (Union[str, pd.DataFrame]): Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path.
            text_column (str): Column in the training data containing the input text. Defaults to "text".
            label_column (str): Column in the training data containing the target labels. Defaults to "category".
            columns_to_classify (Optional[Union[str, List[str]]]): Column(s) in `new_data` to predict labels for. Defaults to `text_column`.
            split_ratio (float): Ratio of data to use for validation. Must be between 0 and 1. Defaults to 0.2.
            training_params (list): List of 7 training hyperparameters: [weight_decay, loss_fn, learning_rate, batch_size,
                                num_epochs, warmup_steps, gradient_accumulation]. Defaults to [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01].
            tuning (bool): Whether to perform hyperparameter tuning. Defaults to False.
            tuning_params (Optional[dict]): Dictionary of tuning settings if `tuning` is True. Defaults to None.
            model_save_path (Optional[str]): Optional path to save the trained model and tokenizer. Defaults to None.
            prediction_save_path (Optional[str]): Optional path to save annotated predictions as a CSV. Defaults to None.
            seed (int): Random seed for reproducibility. Defaults to 42.

        Returns:
            pd.DataFrame: A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores.
        """

        # Validate training data input
        if not isinstance(train_data, (pd.DataFrame, str)):
            raise ValueError(
                "Please provide data training data. This must be a pandas DataFrame or a path to a CSV file."
            )

        if not isinstance(new_data, (pd.DataFrame, str)):
            raise ValueError(
                "Please provide data to be labeled. This must be a pandas DataFrame or a path to a CSV file."
            )

        # Validate training parameters
        if not isinstance(training_params, list) or len(training_params) < 7:
            raise ValueError(
                "training_params must be a list of at least 7 hyperparameter values."
            )

        if not isinstance(split_ratio, float) or not (0.0 < split_ratio < 1.0):
            raise ValueError("split_ratio must be a float between 0 and 1.")

        # Validate column names
        if not isinstance(text_column, str):
            raise ValueError("text_column must be a string.")
        if not isinstance(label_column, str):
            raise ValueError("label_column must be a string.")

        # Validate columns_to_classify
        if columns_to_classify is not None:
            if not isinstance(columns_to_classify, (str, list)):
                raise ValueError(
                    "columns_to_classify must be a string or a list of strings."
                )
            if isinstance(columns_to_classify, list) and not all(
                isinstance(col, str) for col in columns_to_classify
            ):
                raise ValueError("All entries in columns_to_classify must be strings.")

        set_seed(seed)

        dataset_dict, label2id = load_and_prepare_dataset(
            train_data, text_column, label_column, split_ratio
        )
        tokenized = tokenize_dataset(dataset_dict, self.tokenizer)

        model, trainer = train_model(
            tokenized,
            self.model_name,
            len(label2id),
            training_params,
            tuning,
            tuning_params,
        )

        if model_save_path:
            save_model_and_tokenizer(model, self.tokenizer, model_save_path)

        # Default to using the training text_column if no specific columns_to_classify provided
        if columns_to_classify is None:
            columns_to_classify = text_column

        df_annotated = predict_annotated_dataset(
            new_data=new_data,
            model=model,
            text_columns=columns_to_classify,
            tokenizer=self.tokenizer,
            label2id=label2id,
            save_path=prediction_save_path,
        )

        return df_annotated
```

```python
run_pipeline(train_data, new_data, text_column='text', label_column='category', columns_to_classify=None, split_ratio=0.2, training_params=[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01], tuning=False, tuning_params=None, model_save_path=None, prediction_save_path=None, seed=42)
```

This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter tuning, and saving of both the trained model and prediction outputs.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `train_data` | `Union[str, DataFrame]` | Labeled dataset for training. Can be a DataFrame or a CSV file path. | *required* |
| `new_data` | `Union[str, DataFrame]` | Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path. | *required* |
| `text_column` | `str` | Column in the training data containing the input text. | `'text'` |
| `label_column` | `str` | Column in the training data containing the target labels. | `'category'` |
| `columns_to_classify` | `Optional[Union[str, List[str]]]` | Column(s) in `new_data` to predict labels for. Defaults to `text_column`. | `None` |
| `split_ratio` | `float` | Ratio of data to use for validation. Must be between 0 and 1. | `0.2` |
| `training_params` | `list` | List of 7 training hyperparameters, in order: `[weight_decay, loss_fn, learning_rate, batch_size, num_epochs, warmup_steps, gradient_accumulation]`; see the annotated example after this table. | `[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01]` |
| `tuning` | `bool` | Whether to perform hyperparameter tuning. | `False` |
| `tuning_params` | `Optional[dict]` | Dictionary of tuning settings if `tuning` is True. | `None` |
| `model_save_path` | `Optional[str]` | Optional path to save the trained model and tokenizer. | `None` |
| `prediction_save_path` | `Optional[str]` | Optional path to save annotated predictions as a CSV. | `None` |
| `seed` | `int` | Random seed for reproducibility. | `42` |
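For illustration, a call that overrides `training_params` (annotated with the positional meaning given above) and classifies several columns at once; the file paths and column names below are hypothetical:

```python
from educhateval import PredictLabels  # import path assumed

classifier = PredictLabels()

custom_params = [
    0.01,             # weight_decay
    "cross_entropy",  # loss_fn
    5e-5,             # learning_rate
    16,               # batch_size
    3,                # num_epochs
    100,              # warmup_steps
    2,                # gradient_accumulation
]

df_annotated = classifier.run_pipeline(
    train_data="data/labeled_turns.csv",               # hypothetical path
    new_data="data/new_dialogues.csv",                 # hypothetical path
    columns_to_classify=["student_msg", "tutor_msg"],  # hypothetical column names
    training_params=custom_params,
    model_save_path="models/turn_classifier",          # hypothetical path
    seed=42,
)
```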

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores. |
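The names of the added prediction columns are produced internally by `predict_annotated_dataset` and are not documented on this page, so a robust way to inspect the result is to diff the columns; a sketch, assuming `df_annotated` and `new_df` from the earlier examples:

```python
# Columns that run_pipeline added on top of the original new_df.
added_cols = [c for c in df_annotated.columns if c not in new_df.columns]
print("Prediction columns added:", added_cols)
print(df_annotated.head())

# Annotations can also be written directly by run_pipeline via prediction_save_path.
df_annotated.to_csv("annotated_output.csv", index=False)  # hypothetical path
```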

Source code in src/educhateval/core.py
```python
def run_pipeline(
    self,
    train_data: Union[str, pd.DataFrame],
    new_data: Union[str, pd.DataFrame],
    # columns in the training data
    text_column: str = "text",
    label_column: str = "category",
    # columns to classify in the new data
    columns_to_classify: Optional[Union[str, List[str]]] = None,
    split_ratio: float = 0.2,
    training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
    tuning: bool = False,
    tuning_params: Optional[dict] = None,
    model_save_path: Optional[str] = None,
    prediction_save_path: Optional[str] = None,
    seed: int = 42,
) -> pd.DataFrame:
    """
    This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based
    classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter
    tuning, and saving of both the trained model and prediction outputs.

    Parameters:
        train_data (Union[str, pd.DataFrame]): Labeled dataset for training. Can be a DataFrame or a CSV file path.
        new_data (Union[str, pd.DataFrame]): Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path.
        text_column (str): Column in the training data containing the input text. Defaults to "text".
        label_column (str): Column in the training data containing the target labels. Defaults to "category".
        columns_to_classify (Optional[Union[str, List[str]]]): Column(s) in `new_data` to predict labels for. Defaults to `text_column`.
        split_ratio (float): Ratio of data to use for validation. Must be between 0 and 1. Defaults to 0.2.
        training_params (list): List of 7 training hyperparameters: [weight_decay, loss_fn, learning_rate, batch_size,
                            num_epochs, warmup_steps, gradient_accumulation]. Defaults to [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01].
        tuning (bool): Whether to perform hyperparameter tuning. Defaults to False.
        tuning_params (Optional[dict]): Dictionary of tuning settings if `tuning` is True. Defaults to None.
        model_save_path (Optional[str]): Optional path to save the trained model and tokenizer. Defaults to None.
        prediction_save_path (Optional[str]): Optional path to save annotated predictions as a CSV. Defaults to None.
        seed (int): Random seed for reproducibility. Defaults to 42.

    Returns:
        pd.DataFrame: A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores.
    """

    # Validate training data input
    if not isinstance(train_data, (pd.DataFrame, str)):
        raise ValueError(
            "Please provide data training data. This must be a pandas DataFrame or a path to a CSV file."
        )

    if not isinstance(new_data, (pd.DataFrame, str)):
        raise ValueError(
            "Please provide data to be labeled. This must be a pandas DataFrame or a path to a CSV file."
        )

    # Validate training parameters
    if not isinstance(training_params, list) or len(training_params) < 7:
        raise ValueError(
            "training_params must be a list of at least 7 hyperparameter values."
        )

    if not isinstance(split_ratio, float) or not (0.0 < split_ratio < 1.0):
        raise ValueError("split_ratio must be a float between 0 and 1.")

    # Validate column names
    if not isinstance(text_column, str):
        raise ValueError("text_column must be a string.")
    if not isinstance(label_column, str):
        raise ValueError("label_column must be a string.")

    # Validate columns_to_classify
    if columns_to_classify is not None:
        if not isinstance(columns_to_classify, (str, list)):
            raise ValueError(
                "columns_to_classify must be a string or a list of strings."
            )
        if isinstance(columns_to_classify, list) and not all(
            isinstance(col, str) for col in columns_to_classify
        ):
            raise ValueError("All entries in columns_to_classify must be strings.")

    set_seed(seed)

    dataset_dict, label2id = load_and_prepare_dataset(
        train_data, text_column, label_column, split_ratio
    )
    tokenized = tokenize_dataset(dataset_dict, self.tokenizer)

    model, trainer = train_model(
        tokenized,
        self.model_name,
        len(label2id),
        training_params,
        tuning,
        tuning_params,
    )

    if model_save_path:
        save_model_and_tokenizer(model, self.tokenizer, model_save_path)

    # Default to using the training text_column if no specific columns_to_classify provided
    if columns_to_classify is None:
        columns_to_classify = text_column

    df_annotated = predict_annotated_dataset(
        new_data=new_data,
        model=model,
        text_columns=columns_to_classify,
        tokenizer=self.tokenizer,
        label2id=label2id,
        save_path=prediction_save_path,
    )

    return df_annotated
```