
Predicting Labels

Module for training and applying a text classification model.

This class streamlines the process of fine-tuning a transformer-based classifier on labeled data and applying the trained model to annotate new, unlabeled datasets. It supports both single- and multi-column prediction and includes optional model saving and evaluation output.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `model_name` | `str` | Name of the pretrained Hugging Face model to fine-tune (default: `"distilbert-base-uncased"`). |
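The constructor accepts any Hugging Face checkpoint name suitable for sequence classification; a minimal sketch (the import path and the non-default model name below are assumptions, not taken from this page):

```python
from educhateval import PredictLabels  # import path assumed; adjust to your install

clf = PredictLabels()                                      # uses "distilbert-base-uncased"
clf_bert = PredictLabels(model_name="bert-base-uncased")   # any compatible HF checkpoint
```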

Methods:

| Name | Description |
| --- | --- |
| `run_pipeline` | Trains the classifier and returns a DataFrame with predicted labels and confidence scores. |
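A minimal end-to-end sketch of the intended workflow; the import path and the toy data are illustrative assumptions, not taken from this page:

```python
import pandas as pd
from educhateval import PredictLabels  # import path assumed

# Tiny illustrative training set using the default column names;
# a real run needs a substantially larger labeled dataset.
train_df = pd.DataFrame({
    "text": [
        "What is a derivative?",
        "Great job on the exercise!",
        "Please submit by Friday.",
    ],
    "category": ["question", "feedback", "instruction"],
})

# Unlabeled data to annotate.
new_df = pd.DataFrame({"text": ["When is the deadline?", "Well done!"]})

classifier = PredictLabels()  # defaults to "distilbert-base-uncased"
df_annotated = classifier.run_pipeline(train_data=train_df, new_data=new_df)
print(df_annotated)
```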

Source code in src/educhateval/core.py
```python
class PredictLabels:
    """
    Module for training and applying a text classification model.

    This class streamlines the process of fine-tuning a transformer-based classifier on labeled data
    and applying the trained model to annotate new, unlabeled datasets. Supports both single- and
    multi-column prediction and includes optional model saving and evaluation output.

    Attributes:
        model_name (str): Name of the pretrained Hugging Face model to fine-tune (default: "distilbert-base-uncased").

    Methods:
        run_pipeline(...): Trains the classifier and returns a DataFrame with predicted labels and confidence scores.
    """

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        self.model_name = model_name
        self.tokenizer = load_tokenizer(model_name)

    def run_pipeline(
        self,
        train_data: Union[str, pd.DataFrame],
        new_data: Union[str, pd.DataFrame],
        # columns in the training data
        text_column: str = "text",
        label_column: str = "category",
        # columns to classify in the new data
        columns_to_classify: Optional[Union[str, List[str]]] = None,
        split_ratio: float = 0.2,
        training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
        tuning: bool = False,
        tuning_params: Optional[dict] = None,
        model_save_path: Optional[str] = None,
        prediction_save_path: Optional[str] = None,
        seed: int = 42,
    ) -> pd.DataFrame:
        """
        This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based
        classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter
        tuning, and saving of both the trained model and prediction outputs.

        Parameters:
            train_data (Union[str, pd.DataFrame]): Labeled dataset for training. Can be a DataFrame or a CSV file path.
            new_data (Union[str, pd.DataFrame]): Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path.
            text_column (str): Column in the training data containing the input text. Defaults to "text".
            label_column (str): Column in the training data containing the target labels. Defaults to "category".
            columns_to_classify (Optional[Union[str, List[str]]]): Column(s) in `new_data` to predict labels for. Defaults to `text_column`.
            split_ratio (float): Ratio of data to use for validation. Must be between 0 and 1. Defaults to 0.2.
            training_params (list): List of 7 training hyperparameters: [weight_decay, loss_fn, learning_rate, batch_size,
                                num_epochs, warmup_steps, gradient_accumulation]. Defaults to [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01].
            tuning (bool): Whether to perform hyperparameter tuning. Defaults to False.
            tuning_params (Optional[dict]): Dictionary of tuning settings if `tuning` is True. Defaults to None.
            model_save_path (Optional[str]): Optional path to save the trained model and tokenizer. Defaults to None.
            prediction_save_path (Optional[str]): Optional path to save annotated predictions as a CSV. Defaults to None.
            seed (int): Random seed for reproducibility. Defaults to 42.

        Returns:
            pd.DataFrame: A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores.
        """

        # Validate training data input
        if not isinstance(train_data, (pd.DataFrame, str)):
            raise ValueError(
                "Please provide data training data. This must be a pandas DataFrame or a path to a CSV file."
            )

        if not isinstance(new_data, (pd.DataFrame, str)):
            raise ValueError(
                "Please provide data to be labeled. This must be a pandas DataFrame or a path to a CSV file."
            )

        # Validate training parameters
        if not isinstance(training_params, list) or len(training_params) < 7:
            raise ValueError(
                "training_params must be a list of at least 7 hyperparameter values."
            )

        if not isinstance(split_ratio, float) or not (0.0 < split_ratio < 1.0):
            raise ValueError("split_ratio must be a float between 0 and 1.")

        # Validate column names
        if not isinstance(text_column, str):
            raise ValueError("text_column must be a string.")
        if not isinstance(label_column, str):
            raise ValueError("label_column must be a string.")

        # Validate columns_to_classify
        if columns_to_classify is not None:
            if not isinstance(columns_to_classify, (str, list)):
                raise ValueError(
                    "columns_to_classify must be a string or a list of strings."
                )
            if isinstance(columns_to_classify, list) and not all(
                isinstance(col, str) for col in columns_to_classify
            ):
                raise ValueError("All entries in columns_to_classify must be strings.")

        set_seed(seed)

        dataset_dict, label2id = load_and_prepare_dataset(
            train_data, text_column, label_column, split_ratio
        )
        tokenized = tokenize_dataset(dataset_dict, self.tokenizer)

        model, trainer = train_model(
            tokenized,
            self.model_name,
            len(label2id),
            training_params,
            tuning,
            tuning_params,
        )

        if model_save_path:
            save_model_and_tokenizer(model, self.tokenizer, model_save_path)

        # Default to using the training text_column if no specific columns_to_classify provided
        if columns_to_classify is None:
            columns_to_classify = text_column

        df_annotated = predict_annotated_dataset(
            new_data=new_data,
            model=model,
            text_columns=columns_to_classify,
            tokenizer=self.tokenizer,
            label2id=label2id,
            save_path=prediction_save_path,
        )

        return df_annotated
```

```python
run_pipeline(train_data, new_data, text_column='text', label_column='category', columns_to_classify=None, split_ratio=0.2, training_params=[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01], tuning=False, tuning_params=None, model_save_path=None, prediction_save_path=None, seed=42)
```

This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter tuning, and saving of both the trained model and prediction outputs.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `train_data` | `Union[str, DataFrame]` | Labeled dataset for training. Can be a DataFrame or a CSV file path. | *required* |
| `new_data` | `Union[str, DataFrame]` | Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path. | *required* |
| `text_column` | `str` | Column in the training data containing the input text. | `'text'` |
| `label_column` | `str` | Column in the training data containing the target labels. | `'category'` |
| `columns_to_classify` | `Optional[Union[str, List[str]]]` | Column(s) in `new_data` to predict labels for. Defaults to `text_column`. | `None` |
| `split_ratio` | `float` | Ratio of data to use for validation. Must be between 0 and 1. | `0.2` |
| `training_params` | `list` | List of 7 training hyperparameters, in order: `[weight_decay, loss_fn, learning_rate, batch_size, num_epochs, warmup_steps, gradient_accumulation]`; see the annotated example after this table. | `[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01]` |
| `tuning` | `bool` | Whether to perform hyperparameter tuning. | `False` |
| `tuning_params` | `Optional[dict]` | Dictionary of tuning settings if `tuning` is True. | `None` |
| `model_save_path` | `Optional[str]` | Optional path to save the trained model and tokenizer. | `None` |
| `prediction_save_path` | `Optional[str]` | Optional path to save annotated predictions as a CSV. | `None` |
| `seed` | `int` | Random seed for reproducibility. | `42` |
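For illustration, a call that overrides `training_params` (annotated with the positional meaning given above) and classifies several columns at once; the file paths and column names below are hypothetical:

```python
from educhateval import PredictLabels  # import path assumed

classifier = PredictLabels()

custom_params = [
    0.01,             # weight_decay
    "cross_entropy",  # loss_fn
    5e-5,             # learning_rate
    16,               # batch_size
    3,                # num_epochs
    100,              # warmup_steps
    2,                # gradient_accumulation
]

df_annotated = classifier.run_pipeline(
    train_data="data/labeled_turns.csv",               # hypothetical path
    new_data="data/new_dialogues.csv",                 # hypothetical path
    columns_to_classify=["student_msg", "tutor_msg"],  # hypothetical column names
    training_params=custom_params,
    model_save_path="models/turn_classifier",          # hypothetical path
    seed=42,
)
```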

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores. |
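The names of the added prediction columns are produced internally by `predict_annotated_dataset` and are not documented on this page, so a robust way to inspect the result is to diff the columns; a sketch, assuming `df_annotated` and `new_df` from the earlier examples:

```python
# Columns that run_pipeline added on top of the original new_df.
added_cols = [c for c in df_annotated.columns if c not in new_df.columns]
print("Prediction columns added:", added_cols)
print(df_annotated.head())

# Annotations can also be written directly by run_pipeline via prediction_save_path.
df_annotated.to_csv("annotated_output.csv", index=False)  # hypothetical path
```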

Source code in src/educhateval/core.py
```python
def run_pipeline(
    self,
    train_data: Union[str, pd.DataFrame],
    new_data: Union[str, pd.DataFrame],
    # columns in the training data
    text_column: str = "text",
    label_column: str = "category",
    # columns to classify in the new data
    columns_to_classify: Optional[Union[str, List[str]]] = None,
    split_ratio: float = 0.2,
    training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
    tuning: bool = False,
    tuning_params: Optional[dict] = None,
    model_save_path: Optional[str] = None,
    prediction_save_path: Optional[str] = None,
    seed: int = 42,
) -> pd.DataFrame:
    """
    This function handles the full pipeline of loading data, preparing datasets, tokenizing inputs, training a transformer-based
    classifier, and applying it to specified text columns in new data. It supports custom hyperparameters, optional hyperparameter
    tuning, and saving of both the trained model and prediction outputs.

    Parameters:
        train_data (Union[str, pd.DataFrame]): Labeled dataset for training. Can be a DataFrame or a CSV file path.
        new_data (Union[str, pd.DataFrame]): Dataset to annotate with predicted labels. Can be a DataFrame or a CSV file path.
        text_column (str): Column in the training data containing the input text. Defaults to "text".
        label_column (str): Column in the training data containing the target labels. Defaults to "category".
        columns_to_classify (Optional[Union[str, List[str]]]): Column(s) in `new_data` to predict labels for. Defaults to `text_column`.
        split_ratio (float): Ratio of data to use for validation. Must be between 0 and 1. Defaults to 0.2.
        training_params (list): List of 7 training hyperparameters: [weight_decay, loss_fn, learning_rate, batch_size,
                            num_epochs, warmup_steps, gradient_accumulation]. Defaults to [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01].
        tuning (bool): Whether to perform hyperparameter tuning. Defaults to False.
        tuning_params (Optional[dict]): Dictionary of tuning settings if `tuning` is True. Defaults to None.
        model_save_path (Optional[str]): Optional path to save the trained model and tokenizer. Defaults to None.
        prediction_save_path (Optional[str]): Optional path to save annotated predictions as a CSV. Defaults to None.
        seed (int): Random seed for reproducibility. Defaults to 42.

    Returns:
        pd.DataFrame: A DataFrame containing the original `new_data` with added columns for predicted labels and confidence scores.
    """

    # Validate training data input
    if not isinstance(train_data, (pd.DataFrame, str)):
        raise ValueError(
            "Please provide data training data. This must be a pandas DataFrame or a path to a CSV file."
        )

    if not isinstance(new_data, (pd.DataFrame, str)):
        raise ValueError(
            "Please provide data to be labeled. This must be a pandas DataFrame or a path to a CSV file."
        )

    # Validate training parameters
    if not isinstance(training_params, list) or len(training_params) < 7:
        raise ValueError(
            "training_params must be a list of at least 7 hyperparameter values."
        )

    if not isinstance(split_ratio, float) or not (0.0 < split_ratio < 1.0):
        raise ValueError("split_ratio must be a float between 0 and 1.")

    # Validate column names
    if not isinstance(text_column, str):
        raise ValueError("text_column must be a string.")
    if not isinstance(label_column, str):
        raise ValueError("label_column must be a string.")

    # Validate columns_to_classify
    if columns_to_classify is not None:
        if not isinstance(columns_to_classify, (str, list)):
            raise ValueError(
                "columns_to_classify must be a string or a list of strings."
            )
        if isinstance(columns_to_classify, list) and not all(
            isinstance(col, str) for col in columns_to_classify
        ):
            raise ValueError("All entries in columns_to_classify must be strings.")

    set_seed(seed)

    dataset_dict, label2id = load_and_prepare_dataset(
        train_data, text_column, label_column, split_ratio
    )
    tokenized = tokenize_dataset(dataset_dict, self.tokenizer)

    model, trainer = train_model(
        tokenized,
        self.model_name,
        len(label2id),
        training_params,
        tuning,
        tuning_params,
    )

    if model_save_path:
        save_model_and_tokenizer(model, self.tokenizer, model_save_path)

    # Default to using the training text_column if no specific columns_to_classify provided
    if columns_to_classify is None:
        columns_to_classify = text_column

    df_annotated = predict_annotated_dataset(
        new_data=new_data,
        model=model,
        text_columns=columns_to_classify,
        tokenizer=self.tokenizer,
        label2id=label2id,
        save_path=prediction_save_path,
    )

    return df_annotated
```