Framework Generator

Module for generating synthetic annotated datasets (frameworks) with locally hosted instruction-tuned models, and for filtering out low-quality examples via classifier agreement.

Attributes:

Name	Type	Description
`model_name`	`str`	Name of the local model loaded in LM Studio and referenced in generation requests (default: "llama-3.2-3b-instruct").
`api_url`	`str`	Full URL of the locally hosted LM Studio API endpoint that handles generation requests. This includes the server host, port, and path (default: "http://localhost:1234/v1/completions").

Methods:

Name	Description
`generate_framework`	Generates a synthetic labeled dataset from prompts and returns it as a pandas DataFrame.
`filter_with_classifier`	Filters the generated dataset using a small classifier trained on real labeled data.

Source code in src/educhateval/core.py

class FrameworkGenerator:
    """
    Generate synthetic annotated datasets (frameworks) with a locally hosted
    instruction-tuned model, and filter out low-quality examples via classifier agreement.

    Attributes:
        model_name (str): Name of the local model loaded in LM Studio and referenced in generation requests (default: "llama-3.2-3b-instruct").
        api_url (str): Full URL of the locally hosted LM Studio API endpoint that handles generation requests. This includes the server host, port, and path (default: "http://localhost:1234/v1/completions").

    Methods:
        generate_framework(...): Generates a synthetic labeled dataset from prompts and returns it as a pandas DataFrame.
        filter_with_classifier(...): Filters the generated dataset using a small classifier trained on real labeled data.
    """

    def __init__(
        self,
        model_name: str = "llama-3.2-3b-instruct",
        api_url: str = "http://localhost:1234/v1/completions",
    ):
        # Both values are only stored here; they are forwarded to
        # `synthesize_dataset` on every `generate_framework` call.
        self.model_name = model_name
        self.api_url = api_url

    #### 1. function to generate the synthetic dataset
    def generate_framework(
        self,
        prompt_path: str = None,
        prompt_dict_input: dict = None,
        num_samples: int = 500,
        json_out: str = None,
        csv_out: str = None,
        seed: int = 42,
        temperature: float = 0.85,
        top_p: float = 0.90,
    ) -> pd.DataFrame:
        """
        Generate a synthetic labeled dataset from prompts using a language model.
        Either `prompt_path` (path to .py file with `prompt_dict`) or `prompt_dict_input` must be provided.

        Parameters:
            prompt_path (str): Path to a Python file containing a prompt dictionary.
            prompt_dict_input (dict): Prompt dictionary directly provided.
            num_samples (int): Number of samples to generate per category.
            json_out (str): Optional path to save JSON output.
            csv_out (str): Optional path to save CSV output.
            seed (int): Random seed for reproducibility.
            temperature (float): Sampling temperature for generation.
            top_p (float): Top-p sampling parameter.

        Returns:
            pd.DataFrame: Cleaned, labeled synthetic dataset.

        Raises:
            ValueError: If neither `prompt_path` nor `prompt_dict_input` is provided
                (an empty string or empty dict counts as not provided).
        """
        # Fail fast, before seeding or contacting the model endpoint.
        if not prompt_path and not prompt_dict_input:
            raise ValueError(
                "You must provide either a prompt_path or prompt_dict_input."
            )

        # Seed first so the generation that follows runs under the requested seed.
        set_seed(seed)

        df = synthesize_dataset(
            prompt_dict=prompt_dict_input,
            prompt_path=prompt_path,
            model_name=self.model_name,
            num_samples=num_samples,
            api_url=self.api_url,
            json_out=json_out,
            csv_out=csv_out,
            temperature=temperature,
            top_p=top_p,
        )

        return df

    #### 2. function to quality check the dataset
    def filter_with_classifier(
        self,
        train_data: Union[str, pd.DataFrame],
        synth_data: Union[str, pd.DataFrame],
        text_column: str = "text",
        label_column: str = "category",
        split_ratio: float = 0.2,
        training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
        tuning: bool = False,
        tuning_params: dict = None,
        model_save_path: str = None,
        classifier_model_name: str = "distilbert-base-uncased",
        filtered_save_path: str = None,
    ) -> pd.DataFrame:
        """
        Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

        Parameters:
            train_data (str or pd.DataFrame): Path or DataFrame of small labeled training set.
            synth_data (str or pd.DataFrame): Path or DataFrame of generated synthetic dataset.
            text_column (str): Name of the text column.
            label_column (str): Name of the label column.
            split_ratio (float): Ratio for train/test split.
            training_params (list): Training hyperparameters. The list is copied
                before use, so neither the default nor a caller-owned list is
                modified by this method.
            tuning (bool): Whether to perform hyperparameter tuning using Optuna.
            tuning_params (dict): Optional tuning grid.
            model_save_path (str): Optional path to save the classifier model.
            classifier_model_name (str): HF model ID for the classifier.
            filtered_save_path (str): Optional path to save filtered synthetic dataset.

        Returns:
            pd.DataFrame: Filtered synthetic dataset based on classifier agreement.

        Raises:
            ValueError: If `train_data` or `synth_data` is an empty DataFrame.
        """
        # Only DataFrame inputs can be checked for emptiness here; string
        # inputs are passed through unchanged to the loading helpers.
        if isinstance(train_data, pd.DataFrame) and train_data.empty:
            raise ValueError("Provided training DataFrame is empty.")
        if isinstance(synth_data, pd.DataFrame) and synth_data.empty:
            raise ValueError("Provided synthetic DataFrame is empty.")

        # The default list in the signature is one object shared by every call.
        # Hand the trainer a private copy so an in-place change downstream
        # cannot leak into later calls (or into a list the caller still owns).
        hyperparams = list(training_params) if training_params is not None else None

        tokenizer = load_tokenizer(classifier_model_name)

        dataset_dict, label2id = load_and_prepare_dataset(
            train_data, text_column, label_column, split_ratio
        )

        tokenized = tokenize_dataset(dataset_dict, tokenizer)

        model, trainer = train_model(
            tokenized,
            classifier_model_name,
            len(label2id),  # number of distinct labels in the training data
            hyperparams,
            tuning,
            tuning_params,
        )

        # The evaluation result is not used further in this method.
        trainer.evaluate()

        if model_save_path:
            save_model_and_tokenizer(model, tokenizer, model_save_path)

        df_filtered = filter_synthesized_data(
            synth_input=synth_data,
            model=model,
            tokenizer=tokenizer,
            label_column=label_column,
            save_path=filtered_save_path,
        )

        return df_filtered

`generate_framework(prompt_path=None, prompt_dict_input=None, num_samples=500, json_out=None, csv_out=None, seed=42, temperature=0.85, top_p=0.9)`

Generate a synthetic labeled dataset from prompts using a language model. Either prompt_path (path to .py file with prompt_dict) or prompt_dict_input must be provided.

Parameters:

Name	Type	Description	Default
`prompt_path`	`str`	Path to a Python file containing a prompt dictionary.	`None`
`prompt_dict_input`	`dict`	Prompt dictionary directly provided.	`None`
`num_samples`	`int`	Number of samples to generate per category.	`500`
`json_out`	`str`	Optional path to save JSON output.	`None`
`csv_out`	`str`	Optional path to save CSV output.	`None`
`seed`	`int`	Random seed for reproducibility.	`42`
`temperature`	`float`	Sampling temperature for generation.	`0.85`
`top_p`	`float`	Top-p sampling parameter.	`0.9`

Returns:

Type	Description
`DataFrame`	Cleaned, labeled synthetic dataset.

Source code in src/educhateval/core.py

def generate_framework(
    self,
    prompt_path: str = None,
    prompt_dict_input: dict = None,
    num_samples: int = 500,
    json_out: str = None,
    csv_out: str = None,
    seed: int = 42,
    temperature: float = 0.85,
    top_p: float = 0.90,
) -> pd.DataFrame:
    """
    Build a synthetic labeled dataset by prompting a language model.
    A prompt source is mandatory: pass `prompt_path` (a .py file defining `prompt_dict`) or `prompt_dict_input`.

    Parameters:
        prompt_path (str): Path to a Python file containing a prompt dictionary.
        prompt_dict_input (dict): Prompt dictionary directly provided.
        num_samples (int): Number of samples to generate per category.
        json_out (str): Optional path to save JSON output.
        csv_out (str): Optional path to save CSV output.
        seed (int): Random seed for reproducibility.
        temperature (float): Sampling temperature for generation.
        top_p (float): Top-p sampling parameter.

    Returns:
        pd.DataFrame: Cleaned, labeled synthetic dataset.

    Raises:
        ValueError: If both `prompt_path` and `prompt_dict_input` are missing or empty.
    """
    # Guard clause: refuse to run without any prompt source.
    has_prompt_source = bool(prompt_path or prompt_dict_input)
    if not has_prompt_source:
        raise ValueError("You must provide either a prompt_path or prompt_dict_input.")

    # Seeding happens before any generation request is issued.
    set_seed(seed)

    # Model name and endpoint come from the instance; everything else is
    # forwarded from this call's arguments.
    return synthesize_dataset(
        prompt_dict=prompt_dict_input,
        prompt_path=prompt_path,
        model_name=self.model_name,
        num_samples=num_samples,
        api_url=self.api_url,
        json_out=json_out,
        csv_out=csv_out,
        temperature=temperature,
        top_p=top_p,
    )

`filter_with_classifier(train_data, synth_data, text_column='text', label_column='category', split_ratio=0.2, training_params=[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01], tuning=False, tuning_params=None, model_save_path=None, classifier_model_name='distilbert-base-uncased', filtered_save_path=None)`

Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

Parameters:

Name	Type	Description	Default
`train_data`	`str or DataFrame`	Path or DataFrame of small labeled training set.	required
`synth_data`	`str or DataFrame`	Path or DataFrame of generated synthetic dataset.	required
`text_column`	`str`	Name of the text column.	`'text'`
`label_column`	`str`	Name of the label column.	`'category'`
`split_ratio`	`float`	Ratio for train/test split.	`0.2`
`training_params`	`list`	Training hyperparameters.	`[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01]`
`tuning`	`bool`	Whether to perform hyperparameter tuning using Optuna.	`False`
`tuning_params`	`dict`	Optional tuning grid.	`None`
`model_save_path`	`str`	Optional path to save the classifier model.	`None`
`classifier_model_name`	`str`	HF model ID for the classifier.	`'distilbert-base-uncased'`
`filtered_save_path`	`str`	Optional path to save filtered synthetic dataset.	`None`

Returns:

Type	Description
`DataFrame`	Filtered synthetic dataset based on classifier agreement.

Source code in src/educhateval/core.py

def filter_with_classifier(
    self,
    train_data: Union[str, pd.DataFrame],
    synth_data: Union[str, pd.DataFrame],
    text_column: str = "text",
    label_column: str = "category",
    split_ratio: float = 0.2,
    training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
    tuning: bool = False,
    tuning_params: dict = None,
    model_save_path: str = None,
    classifier_model_name: str = "distilbert-base-uncased",
    filtered_save_path: str = None,
) -> pd.DataFrame:
    """
    Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

    Parameters:
        train_data (str or pd.DataFrame): Path or DataFrame of small labeled training set.
        synth_data (str or pd.DataFrame): Path or DataFrame of generated synthetic dataset.
        text_column (str): Name of the text column.
        label_column (str): Name of the label column.
        split_ratio (float): Ratio for train/test split.
        training_params (list): Training hyperparameters. The list is copied
            before use, so neither the default nor a caller-owned list is
            modified by this method.
        tuning (bool): Whether to perform hyperparameter tuning using Optuna.
        tuning_params (dict): Optional tuning grid.
        model_save_path (str): Optional path to save the classifier model.
        classifier_model_name (str): HF model ID for the classifier.
        filtered_save_path (str): Optional path to save filtered synthetic dataset.

    Returns:
        pd.DataFrame: Filtered synthetic dataset based on classifier agreement.

    Raises:
        ValueError: If `train_data` or `synth_data` is an empty DataFrame.
    """
    # Only DataFrame inputs can be checked for emptiness here; string inputs
    # are passed through unchanged to the loading helpers.
    if isinstance(train_data, pd.DataFrame) and train_data.empty:
        raise ValueError("Provided training DataFrame is empty.")
    if isinstance(synth_data, pd.DataFrame) and synth_data.empty:
        raise ValueError("Provided synthetic DataFrame is empty.")

    # The default list in the signature is one object shared by every call.
    # Hand the trainer a private copy so an in-place change downstream cannot
    # leak into later calls (or into a list the caller still owns).
    hyperparams = list(training_params) if training_params is not None else None

    tokenizer = load_tokenizer(classifier_model_name)

    dataset_dict, label2id = load_and_prepare_dataset(
        train_data, text_column, label_column, split_ratio
    )

    tokenized = tokenize_dataset(dataset_dict, tokenizer)

    model, trainer = train_model(
        tokenized,
        classifier_model_name,
        len(label2id),  # number of distinct labels in the training data
        hyperparams,
        tuning,
        tuning_params,
    )

    # The evaluation result is not used further in this method.
    trainer.evaluate()

    if model_save_path:
        save_model_and_tokenizer(model, tokenizer, model_save_path)

    df_filtered = filter_synthesized_data(
        synth_input=synth_data,
        model=model,
        tokenizer=tokenizer,
        label_column=label_column,
        save_path=filtered_save_path,
    )

    return df_filtered