Skip to content

Framework Generator

Class for generating synthetic annotated datasets (frameworks) with locally hosted instruction-tuned models, and for filtering out low-quality examples via classifier agreement.

Attributes:

Name Type Description
model_name str

Name of the local model loaded in LM Studio and referenced in generation requests (default: "llama-3.2-3b-instruct").

api_url str

Full URL of the locally hosted LM Studio API endpoint that handles generation requests. This includes the server host, port, and path (default: "http://localhost:1234/v1/completions").

Methods:

Name Description
generate_framework

Generates a synthetic labeled dataset from prompts and returns it as a pandas DataFrame.

filter_with_classifier

Filters the generated dataset using a small classifier trained on real labeled data.

Source code in src/educhateval/core.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
class FrameworkGenerator:
    """
    Generate synthetic annotated datasets (frameworks) with a locally hosted
    instruction-tuned model, and filter out low-quality examples via classifier agreement.

    Attributes:
        model_name (str): Name of the local model loaded in LM Studio and referenced in generation requests (default: "llama-3.2-3b-instruct").
        api_url (str): Full URL of the locally hosted LM Studio API endpoint that handles generation requests. This includes the server host, port, and path (default: "http://localhost:1234/v1/completions").

    Methods:
        generate_framework(...): Generates a synthetic labeled dataset from prompts and returns it as a pandas DataFrame.
        filter_with_classifier(...): Filters the generated dataset using a small classifier trained on real labeled data.
    """

    def __init__(
        self,
        model_name: str = "llama-3.2-3b-instruct",
        api_url: str = "http://localhost:1234/v1/completions",
    ):
        """
        Store the generation model name and the API endpoint used by `generate_framework`.

        Parameters:
            model_name (str): Name of the model referenced in generation requests.
            api_url (str): URL of the completion endpoint.
        """
        self.model_name = model_name
        self.api_url = api_url

    def generate_framework(
        self,
        prompt_path: str = None,
        prompt_dict_input: dict = None,
        num_samples: int = 500,
        json_out: str = None,
        csv_out: str = None,
        seed: int = 42,
        temperature: float = 0.85,
        top_p: float = 0.90,
    ) -> pd.DataFrame:
        """
        Generate a synthetic labeled dataset from prompts using a language model.
        Either `prompt_path` (path to .py file with `prompt_dict`) or `prompt_dict_input` must be provided.

        Parameters:
            prompt_path (str): Path to a Python file containing a prompt dictionary.
            prompt_dict_input (dict): Prompt dictionary directly provided.
            num_samples (int): Number of samples to generate per category.
            json_out (str): Optional path to save JSON output.
            csv_out (str): Optional path to save CSV output.
            seed (int): Random seed for reproducibility.
            temperature (float): Sampling temperature for generation.
            top_p (float): Top-p sampling parameter.

        Returns:
            pd.DataFrame: Cleaned, labeled synthetic dataset.

        Raises:
            ValueError: If neither `prompt_path` nor `prompt_dict_input` is given
                (empty strings and empty dicts count as not given).
        """
        if not prompt_path and not prompt_dict_input:
            raise ValueError(
                "You must provide either a prompt_path or prompt_dict_input."
            )

        # Seed before any generation work so repeated runs are comparable.
        set_seed(seed)

        # Both prompt sources are forwarded as-is; the helper decides which to use.
        df = synthesize_dataset(
            prompt_dict=prompt_dict_input,
            prompt_path=prompt_path,
            model_name=self.model_name,
            num_samples=num_samples,
            api_url=self.api_url,
            json_out=json_out,
            csv_out=csv_out,
            temperature=temperature,
            top_p=top_p,
        )

        return df

    def filter_with_classifier(
        self,
        train_data: Union[str, pd.DataFrame],
        synth_data: Union[str, pd.DataFrame],
        text_column: str = "text",
        label_column: str = "category",
        split_ratio: float = 0.2,
        training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
        tuning: bool = False,
        tuning_params: dict = None,
        model_save_path: str = None,
        classifier_model_name: str = "distilbert-base-uncased",
        filtered_save_path: str = None,
    ) -> pd.DataFrame:
        """
        Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

        Parameters:
            train_data (str or pd.DataFrame): Path or DataFrame of small labeled training set.
            synth_data (str or pd.DataFrame): Path or DataFrame of generated synthetic dataset.
            text_column (str): Name of the text column.
            label_column (str): Name of the label column.
            split_ratio (float): Ratio for train/test split.
            training_params (list): Training hyperparameters, forwarded to the
                training helper. A list argument is copied before being
                forwarded, so neither the shared default nor the caller's list
                is handed to the helper directly.
            tuning (bool): Whether to perform hyperparameter tuning using Optuna.
            tuning_params (dict): Optional tuning grid.
            model_save_path (str): Optional path to save the classifier model.
            classifier_model_name (str): HF model ID for the classifier.
            filtered_save_path (str): Optional path to save filtered synthetic dataset.

        Returns:
            pd.DataFrame: Filtered synthetic dataset based on classifier agreement.

        Raises:
            ValueError: If `train_data` or `synth_data` is an empty DataFrame.
        """
        # Fail fast on empty frames; path inputs are validated by the loaders.
        if isinstance(train_data, pd.DataFrame) and train_data.empty:
            raise ValueError("Provided training DataFrame is empty.")
        if isinstance(synth_data, pd.DataFrame) and synth_data.empty:
            raise ValueError("Provided synthetic DataFrame is empty.")

        # The default list is created once at definition time and shared by
        # every call. Forward a copy so downstream code can never mutate it.
        # Non-list values (e.g. None) are passed through untouched.
        if isinstance(training_params, list):
            params = list(training_params)
        else:
            params = training_params

        tokenizer = load_tokenizer(classifier_model_name)

        dataset_dict, label2id = load_and_prepare_dataset(
            train_data, text_column, label_column, split_ratio
        )

        tokenized = tokenize_dataset(dataset_dict, tokenizer)

        # The number of distinct labels sizes the classification head.
        model, trainer = train_model(
            tokenized,
            classifier_model_name,
            len(label2id),
            params,
            tuning,
            tuning_params,
        )

        # Evaluation is run for its side effects; the return value is not used here.
        trainer.evaluate()

        if model_save_path:
            save_model_and_tokenizer(model, tokenizer, model_save_path)

        df_filtered = filter_synthesized_data(
            synth_input=synth_data,
            model=model,
            tokenizer=tokenizer,
            label_column=label_column,
            save_path=filtered_save_path,
        )

        return df_filtered

generate_framework(prompt_path=None, prompt_dict_input=None, num_samples=500, json_out=None, csv_out=None, seed=42, temperature=0.85, top_p=0.9)

Generate a synthetic labeled dataset from prompts using a language model. Either prompt_path (path to .py file with prompt_dict) or prompt_dict_input must be provided.

Parameters:

Name Type Description Default
prompt_path str

Path to a Python file containing a prompt dictionary.

None
prompt_dict_input dict

Prompt dictionary directly provided.

None
num_samples int

Number of samples to generate per category.

500
json_out str

Optional path to save JSON output.

None
csv_out str

Optional path to save CSV output.

None
seed int

Random seed for reproducibility.

42
temperature float

Sampling temperature for generation.

0.85
top_p float

Top-p sampling parameter.

0.9

Returns:

Type Description
DataFrame

pd.DataFrame: Cleaned, labeled synthetic dataset.

Source code in src/educhateval/core.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def generate_framework(
    self,
    prompt_path: str = None,
    prompt_dict_input: dict = None,
    num_samples: int = 500,
    json_out: str = None,
    csv_out: str = None,
    seed: int = 42,
    temperature: float = 0.85,
    top_p: float = 0.90,
) -> pd.DataFrame:
    """
    Build a synthetic labeled dataset by prompting a language model.

    At least one prompt source is required: `prompt_path` (a .py file that
    defines `prompt_dict`) or `prompt_dict_input` (the dictionary itself).

    Parameters:
        prompt_path (str): Path to a Python file containing a prompt dictionary.
        prompt_dict_input (dict): Prompt dictionary directly provided.
        num_samples (int): Number of samples to generate per category.
        json_out (str): Optional path to save JSON output.
        csv_out (str): Optional path to save CSV output.
        seed (int): Random seed for reproducibility.
        temperature (float): Sampling temperature for generation.
        top_p (float): Top-p sampling parameter.

    Returns:
        pd.DataFrame: Cleaned, labeled synthetic dataset.

    Raises:
        ValueError: If both prompt sources are missing or empty.
    """
    # Guard clause: an empty string or empty dict counts as "not provided".
    has_prompt_source = bool(prompt_path or prompt_dict_input)
    if not has_prompt_source:
        raise ValueError("You must provide either a prompt_path or prompt_dict_input.")

    # Seed first so everything downstream runs under the requested seed.
    set_seed(seed)

    # Collect the generation request in one place, then hand it to the helper.
    request = {
        "prompt_dict": prompt_dict_input,
        "prompt_path": prompt_path,
        "model_name": self.model_name,
        "num_samples": num_samples,
        "api_url": self.api_url,
        "json_out": json_out,
        "csv_out": csv_out,
        "temperature": temperature,
        "top_p": top_p,
    }
    return synthesize_dataset(**request)

filter_with_classifier(train_data, synth_data, text_column='text', label_column='category', split_ratio=0.2, training_params=[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01], tuning=False, tuning_params=None, model_save_path=None, classifier_model_name='distilbert-base-uncased', filtered_save_path=None)

Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

Parameters:

Name Type Description Default
train_data str or DataFrame

Path or DataFrame of small labeled training set.

required
synth_data str or DataFrame

Path or DataFrame of generated synthetic dataset.

required
text_column str

Name of the text column.

'text'
label_column str

Name of the label column.

'category'
split_ratio float

Ratio for train/test split.

0.2
training_params list

Training hyperparameters.

[0.01, 'cross_entropy', 5e-05, 8, 8, 4, 0.01]
tuning bool

Whether to perform hyperparameter tuning using Optuna.

False
tuning_params dict

Optional tuning grid.

None
model_save_path str

Optional path to save the classifier model.

None
classifier_model_name str

HF model ID for the classifier.

'distilbert-base-uncased'
filtered_save_path str

Optional path to save filtered synthetic dataset.

None

Returns:

Type Description
DataFrame

pd.DataFrame: Filtered synthetic dataset based on classifier agreement.

Source code in src/educhateval/core.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def filter_with_classifier(
    self,
    train_data: Union[str, pd.DataFrame],
    synth_data: Union[str, pd.DataFrame],
    text_column: str = "text",
    label_column: str = "category",
    split_ratio: float = 0.2,
    training_params: list = [0.01, "cross_entropy", 5e-5, 8, 8, 4, 0.01],
    tuning: bool = False,
    tuning_params: dict = None,
    model_save_path: str = None,
    classifier_model_name: str = "distilbert-base-uncased",
    filtered_save_path: str = None,
) -> pd.DataFrame:
    """
    Train a small classifier on real labeled data and use it to filter the synthetic dataset by agreement.

    Parameters:
        train_data (str or pd.DataFrame): Path or DataFrame of small labeled training set.
        synth_data (str or pd.DataFrame): Path or DataFrame of generated synthetic dataset.
        text_column (str): Name of the text column.
        label_column (str): Name of the label column.
        split_ratio (float): Ratio for train/test split.
        training_params (list): Training hyperparameters, forwarded to the
            training helper. A list argument is copied before being forwarded,
            so neither the shared default nor the caller's list is handed to
            the helper directly.
        tuning (bool): Whether to perform hyperparameter tuning using Optuna.
        tuning_params (dict): Optional tuning grid.
        model_save_path (str): Optional path to save the classifier model.
        classifier_model_name (str): HF model ID for the classifier.
        filtered_save_path (str): Optional path to save filtered synthetic dataset.

    Returns:
        pd.DataFrame: Filtered synthetic dataset based on classifier agreement.

    Raises:
        ValueError: If `train_data` or `synth_data` is an empty DataFrame.
    """
    # Fail fast on empty frames; path inputs are validated by the loaders.
    if isinstance(train_data, pd.DataFrame) and train_data.empty:
        raise ValueError("Provided training DataFrame is empty.")
    if isinstance(synth_data, pd.DataFrame) and synth_data.empty:
        raise ValueError("Provided synthetic DataFrame is empty.")

    # The default list is created once at definition time and shared by every
    # call. Forward a copy so downstream code can never mutate it. Non-list
    # values (e.g. None) are passed through untouched.
    if isinstance(training_params, list):
        params = list(training_params)
    else:
        params = training_params

    tokenizer = load_tokenizer(classifier_model_name)

    dataset_dict, label2id = load_and_prepare_dataset(
        train_data, text_column, label_column, split_ratio
    )

    tokenized = tokenize_dataset(dataset_dict, tokenizer)

    # The number of distinct labels sizes the classification head.
    model, trainer = train_model(
        tokenized,
        classifier_model_name,
        len(label2id),
        params,
        tuning,
        tuning_params,
    )

    # Evaluation is run for its side effects; the return value is not used here.
    trainer.evaluate()

    if model_save_path:
        save_model_and_tokenizer(model, tokenizer, model_save_path)

    df_filtered = filter_synthesized_data(
        synth_input=synth_data,
        model=model,
        tokenizer=tokenizer,
        label_column=label_column,
        save_path=filtered_save_path,
    )

    return df_filtered