Skip to content

Synthesizing Interactions

Module for generating multi-turn dialogues between a student and tutor agent using large language models.

This class wraps backend-specific model interfaces and orchestrates the simulation of conversations between two agents. It supports customizable educational modes and sampling behavior and ensures reproducibility via global seeding. Outputs are returned as structured pandas DataFrames.

Attributes:

Name Type Description
backend str

Backend to use for inference. Options are "hf" (Hugging Face) or "mlx" (MLX).

model_id str

The identifier of the model to use, e.g., "gpt2" (Hugging Face) or "Qwen2.5-7B-Instruct-1M-4bit" (MLX).

sampling_params Optional[dict]

Sampling hyperparameters such as temperature, top_p, or top_k.

Methods:

Name Description
simulate_dialogue

Simulates a dialogue and returns it as a pandas DataFrame.

Source code in src/educhateval/core.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
class DialogueSimulator:
    """
    Module for generating multi-turn dialogues between a student and tutor agent using large language models.

    This class wraps backend-specific model interfaces and orchestrates the simulation of conversations between two agents.
    It supports customizable educational modes and sampling behavior and ensures reproducibility via global seeding.
    Outputs are returned as structured pandas DataFrames.

    Attributes:
        backend (str): Backend to use for inference. Options are "hf" (Hugging Face) or "mlx" (MLX).
        model_id (str): The identifier of the model to use, e.g., "gpt2" (Hugging Face) or "Qwen2.5-7B-Instruct-1M-4bit" (MLX).
        sampling_params (Optional[dict]): Sampling hyperparameters such as temperature, top_p, or top_k.

    Methods:
        simulate_dialogue(...): Simulates a dialogue and returns it as a pandas DataFrame.
    """

    def __init__(
        self,
        backend: str = "mlx",
        model_id: str = "mlx-community/Qwen2.5-7B-Instruct-1M-4bit",
        sampling_params: Optional[dict] = None,
    ):
        # Instantiate the backend-specific chat wrapper. Note the key name
        # difference in defaults: HF uses "temperature", MLX uses "temp".
        if backend == "hf":
            self.model = ChatHF(
                model_id=model_id,
                sampling_params=sampling_params
                or {"temperature": 0.9, "top_p": 0.9, "top_k": 50},
            )
        elif backend == "mlx":
            self.model = ChatMLX(
                model_id=model_id,
                sampling_params=sampling_params
                or {"temp": 0.9, "top_p": 0.9, "top_k": 40},
            )
        else:
            raise ValueError("Unsupported backend")

        # Eagerly load model weights so simulate_dialogue can be called immediately.
        self.model.load()

    def simulate_dialogue(
        self,
        mode: str = "general_task_solving",
        turns: int = 5,
        seed_message_input: str = "Hi, I'm a student seeking assistance with my studies.",
        log_dir: Optional[Path] = None,
        save_csv_path: Optional[Path] = None,
        seed: int = 42,
        custom_prompt_file: Optional[Path] = None,
        system_prompts: Optional[dict] = None,
    ) -> pd.DataFrame:
        """
        Simulates a multi-turn dialogue using either built-in or custom prompts.

        Prompt resolution order: an explicit `system_prompts` dict wins; otherwise
        `custom_prompt_file` is loaded (YAML with a top-level `conversation_types`
        mapping keyed by mode); otherwise hardcoded defaults are used.

        Args:
            mode: Mode key to select prompt pair (student/tutor).
            turns: Number of back-and-forth turns to simulate.
            seed_message_input: First message from the student.
            log_dir: Directory to save raw log (optional).
            save_csv_path: Path to save structured DataFrame (optional).
            seed: Random seed for reproducibility.
            custom_prompt_file: Optional path to custom YAML defining prompt modes.
            system_prompts: Optional dict of system prompts (e.g., "student"/"tutor" keys);
                overrides file-based prompt loading.

        Returns:
            pd.DataFrame: Structured DataFrame of the conversation.

        Raises:
            ValueError: If both prompt sources are given, the YAML cannot be read,
                or the file lacks the required structure or mode.
        """
        set_seed(seed)

        # Validate input source: the two prompt sources are mutually exclusive.
        if system_prompts is not None and custom_prompt_file is not None:
            raise ValueError("Provide only one of `system_prompts` or `custom_prompt_file`, not both.")

        # Load prompts from file if needed
        if system_prompts is None:
            if custom_prompt_file:
                import yaml
                try:
                    with open(custom_prompt_file, "r") as f:
                        custom_prompts = yaml.safe_load(f)
                    print(f" Loaded custom prompts from: {custom_prompt_file}")
                except Exception as e:
                    # Chain the original parse/IO error for debuggability.
                    raise ValueError(f"Failed to load YAML from {custom_prompt_file}: {e}") from e

                # safe_load returns None for an empty file (and may return non-dict
                # values); guard before membership tests to avoid a raw TypeError.
                if not isinstance(custom_prompts, dict) or "conversation_types" not in custom_prompts:
                    raise ValueError(f"Missing 'conversation_types' in custom prompt file: {custom_prompt_file}")

                if mode not in custom_prompts["conversation_types"]:
                    raise ValueError(f"Mode '{mode}' not found in custom prompt file: {custom_prompt_file}")

                system_prompts = custom_prompts["conversation_types"][mode]

            else:
                # Use built-in fallback
                print("Using default hardcoded prompts.")
                system_prompts = {
                    "student": "You are a student asking for help with a task.",
                    "tutor": "You are a helpful tutor guiding the student step by step.",
                }

        # Simulate conversation with the resolved prompts.
        df = simulate_conversation(
            model=self.model,
            turns=turns,
            seed_message_input=seed_message_input,
            log_dir=log_dir,
            save_csv_path=save_csv_path,
            system_prompts=system_prompts,
            custom_prompt_file=None,  # already used, no need to pass again
            mode=mode,
        )

        print("\nFull dialogue stored in DataFrame. Use the returned object or view as `df`.")
        return df

simulate_dialogue(mode='general_task_solving', turns=5, seed_message_input="Hi, I'm a student seeking assistance with my studies.", log_dir=None, save_csv_path=None, seed=42, custom_prompt_file=None, system_prompts=None)

Simulates a multi-turn dialogue using either built-in or custom prompts.

Parameters:

Name Type Description Default
mode str

Mode key to select prompt pair (student/tutor).

'general_task_solving'
turns int

Number of back-and-forth turns to simulate.

5
seed_message_input str

First message from the student.

"Hi, I'm a student seeking assistance with my studies."
log_dir Optional[Path]

Directory to save raw log (optional).

None
save_csv_path Optional[Path]

Path to save structured DataFrame (optional).

None
seed int

Random seed for reproducibility.

42
custom_prompt_file Optional[Path]

Optional path to custom YAML defining prompt modes.

None
system_prompts Optional[dict]

Optional dictionary of custom system prompts (e.g., "student"/"tutor" keys); overrides file-based prompt loading.

None

Returns:

Type Description
DataFrame

pd.DataFrame: Structured DataFrame of the conversation.

Source code in src/educhateval/core.py
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
def simulate_dialogue(
    self,
    mode: str = "general_task_solving",
    turns: int = 5,
    seed_message_input: str = "Hi, I'm a student seeking assistance with my studies.",
    log_dir: Optional[Path] = None,
    save_csv_path: Optional[Path] = None,
    seed: int = 42,
    custom_prompt_file: Optional[Path] = None,
    system_prompts: Optional[dict] = None,
) -> pd.DataFrame:
    """
    Simulates a multi-turn dialogue using either built-in or custom prompts.

    Prompt resolution order: an explicit `system_prompts` dict wins; otherwise
    `custom_prompt_file` is loaded (YAML with a top-level `conversation_types`
    mapping keyed by mode); otherwise hardcoded defaults are used.

    Args:
        mode: Mode key to select prompt pair (student/tutor).
        turns: Number of back-and-forth turns to simulate.
        seed_message_input: First message from the student.
        log_dir: Directory to save raw log (optional).
        save_csv_path: Path to save structured DataFrame (optional).
        seed: Random seed for reproducibility.
        custom_prompt_file: Optional path to custom YAML defining prompt modes.
        system_prompts: Optional dict of system prompts (e.g., "student"/"tutor" keys);
            overrides file-based prompt loading.

    Returns:
        pd.DataFrame: Structured DataFrame of the conversation.

    Raises:
        ValueError: If both prompt sources are given, the YAML cannot be read,
            or the file lacks the required structure or mode.
    """
    set_seed(seed)

    # Validate input source: the two prompt sources are mutually exclusive.
    if system_prompts is not None and custom_prompt_file is not None:
        raise ValueError("Provide only one of `system_prompts` or `custom_prompt_file`, not both.")

    # Load prompts from file if needed
    if system_prompts is None:
        if custom_prompt_file:
            import yaml
            try:
                with open(custom_prompt_file, "r") as f:
                    custom_prompts = yaml.safe_load(f)
                print(f" Loaded custom prompts from: {custom_prompt_file}")
            except Exception as e:
                # Chain the original parse/IO error for debuggability.
                raise ValueError(f"Failed to load YAML from {custom_prompt_file}: {e}") from e

            # safe_load returns None for an empty file (and may return non-dict
            # values); guard before membership tests to avoid a raw TypeError.
            if not isinstance(custom_prompts, dict) or "conversation_types" not in custom_prompts:
                raise ValueError(f"Missing 'conversation_types' in custom prompt file: {custom_prompt_file}")

            if mode not in custom_prompts["conversation_types"]:
                raise ValueError(f"Mode '{mode}' not found in custom prompt file: {custom_prompt_file}")

            system_prompts = custom_prompts["conversation_types"][mode]

        else:
            # Use built-in fallback
            print("Using default hardcoded prompts.")
            system_prompts = {
                "student": "You are a student asking for help with a task.",
                "tutor": "You are a helpful tutor guiding the student step by step.",
            }

    # Simulate conversation with the resolved prompts.
    df = simulate_conversation(
        model=self.model,
        turns=turns,
        seed_message_input=seed_message_input,
        log_dir=log_dir,
        save_csv_path=save_csv_path,
        system_prompts=system_prompts,
        custom_prompt_file=None,  # already used, no need to pass again
        mode=mode,
    )

    print("\nFull dialogue stored in DataFrame. Use the returned object or view as `df`.")
    return df