Skip to content

Visualizer

Module for generating four different visualizations for analyzing the interactions. Every visualization except the fourth (Interaction Distribution) can be created for the student, the tutor, or both. The Interaction Distribution plot requires both student and tutor data to visualize interactions.

1. Barchart of Predicted Classes

Source code in src/educhateval/descriptive_results/display_results.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def plot_category_bars(
    df,
    student_col=None,
    tutor_col=None,
    use_percent=True,
    palette="icefire",
    title="Predicted Classes",
):
    """Draw a grouped bar chart of predicted categories per agent.

    Args:
        df: DataFrame holding the predicted-label column(s).
        student_col: Name of the column with student labels, or None to skip.
        tutor_col: Name of the column with tutor labels, or None to skip.
        use_percent: If True, bars show each category's share within its
            agent (in percent); otherwise they show raw counts.
        palette: Palette passed to ``sns.barplot``.
        title: Title placed above the axes.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If neither ``student_col`` nor ``tutor_col`` is given.
    """
    if not (student_col or tutor_col):
        raise ValueError("You must provide at least one of student_col or tutor_col.")

    # Stack the selected columns into one long frame tagged with the agent.
    frames = []
    for column, agent in ((student_col, "Student"), (tutor_col, "Tutor")):
        if not column:
            continue
        frame = df[[column]].copy()
        frame["source"] = agent
        frames.append(frame.rename(columns={column: "predicted_label"}))
    stacked = pd.concat(frames, ignore_index=True)

    # A shared, sorted category order keeps the x-axis identical for both agents.
    label_order = sorted(stacked["predicted_label"].dropna().unique())
    stacked["predicted_label"] = pd.Categorical(
        stacked["predicted_label"], categories=label_order, ordered=True
    )

    per_group = stacked.groupby(["source", "predicted_label"], observed=True)
    tally = per_group.size().reset_index(name="count")

    if not use_percent:
        tally["value"] = tally["count"]
        y_label = "Number of Occurrences"

        def bar_text(val):
            return f"{int(val)}"

    else:
        # Normalise within each agent so every agent's bars sum to 100.
        agent_totals = tally.groupby("source", observed=True)["count"].transform(
            "sum"
        )
        tally["value"] = (tally["count"] / agent_totals) * 100
        y_label = "Occurrences (%)"

        def bar_text(val):
            return f"{val:.0f}%"

    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 6))

    ax = sns.barplot(
        data=tally,
        x="predicted_label",
        y="value",
        hue="source",
        palette=palette,
        order=label_order,
    )

    ax.set_xlabel("Predicted Category")
    ax.set_ylabel(y_label)
    ax.set_title(title, fontsize=15, fontweight="bold")

    if use_percent:

        def percent_tick(y, _):
            return f"{y:.0f}%"

        ax.yaxis.set_major_formatter(mtick.FuncFormatter(percent_tick))

    # Write the value above every bar that has a positive height.
    all_bars = (bar for container in ax.containers for bar in container)
    for bar in all_bars:
        bar_height = bar.get_height()
        if not bar_height > 0:
            continue
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            bar_text(bar_height),
            xy=(center_x, bar_height),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )

    plt.legend(title="Agent")
    plt.tight_layout()
    plt.show()

Example usage:

viz.plot_category_bars(
    df=annotated_df,
    student_col="predicted_labels_student_msg",
    tutor_col="predicted_labels_tutor_msg",
    use_percent=True,
    title="Distribution of Predicted Classes"
)

Parameters:

Name Type Description Default
df DataFrame The input DataFrame containing predicted categories for student and/or tutor. required
student_col str or None Name of the column with student-predicted labels. Optional. None
tutor_col str or None Name of the column with tutor-predicted labels. Optional. None
use_percent bool Whether to plot percentage values (True) or raw counts (False). True
palette str Color palette used for the plot. Optional. "icefire"
title str Title of the plot. Optional. "Predicted Classes"

Returns:

Name Type Description
None Displays the plot using matplotlib.pyplot.show(). No object is returned.

2. Summary Table

Source code in src/educhateval/descriptive_results/display_results.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def create_prediction_summary_table(df, student_col=None, tutor_col=None):
    """Build a table of counts and percentages for each predicted category.

    Missing predictions (NaN/None) are counted and reported in a single
    trailing row whose category is NaN. They are tallied separately from the
    real labels so that sorting the category names never has to compare a
    label with a missing value.

    Args:
        df: DataFrame holding the predicted-label column(s).
        student_col: Name of the column with student labels, or None to skip.
        tutor_col: Name of the column with tutor labels, or None to skip.

    Returns:
        DataFrame with a "Predicted Category" column followed by
        "<Agent> (n)" (integer counts) and "<Agent> (%)" (strings such as
        "12.5%") for every agent that was requested. Percentages are relative
        to the number of rows in the column, missing values included.
        Categories an agent never produced get 0 and "0.0%".

    Raises:
        ValueError: If neither ``student_col`` nor ``tutor_col`` is given.
    """
    if not student_col and not tutor_col:
        raise ValueError("You must provide at least one of student_col or tutor_col.")

    sources = [
        (agent, df[column])
        for agent, column in (("Student", student_col), ("Tutor", tutor_col))
        if column
    ]

    # Tally real labels and missing values separately; a missing-value key in
    # the category set would make sorted() fail against string labels.
    label_counts = {
        agent: series.value_counts(dropna=True) for agent, series in sources
    }
    missing_counts = {agent: int(series.isna().sum()) for agent, series in sources}

    categories = set()
    for counts in label_counts.values():
        categories.update(counts.index)
    ordered_labels = sorted(categories)

    include_missing = any(missing_counts.values())
    row_labels = list(ordered_labels)
    if include_missing:
        row_labels.append(float("nan"))

    columns = {}
    for agent, series in sources:
        total = len(series)
        per_row = label_counts[agent].reindex(ordered_labels, fill_value=0).tolist()
        if include_missing:
            per_row.append(missing_counts[agent])
        counts = pd.Series(per_row, dtype=int)
        percents = ((counts / total) * 100).round(1).astype(str) + "%"
        # Positional arrays: the rows already follow row_labels, so no
        # index alignment (and no NaN key matching) is needed.
        columns[f"{agent} (n)"] = counts.to_numpy()
        columns[f"{agent} (%)"] = percents.to_numpy()

    full_index = pd.Index(row_labels, name="Predicted Category")
    summary_df = pd.DataFrame(columns, index=full_index)
    return summary_df.reset_index()

Example usage:

summary = viz.create_summary_table(
    df=annotated_df,
    student_col="predicted_labels_student_msg",
    tutor_col="predicted_labels_tutor_msg"
)

print(summary)

Parameters:

Name Type Description Default
df DataFrame The input DataFrame containing predicted categories for student and/or tutor. required
student_col str or None Name of the column with student-predicted labels. Optional. None
tutor_col str or None Name of the column with tutor-predicted labels. Optional. None

Returns:

Name Type Description
summary_df DataFrame A summary table with counts and percentages for each predicted category. Splits by student and tutor (if provided). Missing values are filled with 0.

3. Predicted Classes by Turns

Source code in src/educhateval/descriptive_results/display_results.py
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def plot_predicted_categories(
    df,
    student_col=None,
    tutor_col=None,
    use_percent=True,
    palette="icefire",
    title="Predicted Category Frequency",
    show_ci=False,
):
    """Draw line plots of predicted-category frequency across turns.

    When both agents are requested, one facet per agent is drawn; otherwise a
    single panel is produced.

    Args:
        df: DataFrame with a "turn" column plus the predicted-label column(s).
        student_col: Name of the column with student labels, or None to skip.
        tutor_col: Name of the column with tutor labels, or None to skip.
        use_percent: If True, plot each category's share within its turn and
            agent (y-axis fixed to 0-100); otherwise plot raw counts.
        palette: Palette passed to ``sns.relplot``.
        title: Figure-level title.
        show_ci: If True, ``errorbar=("ci", 95)`` is passed to seaborn;
            otherwise no error bars are requested.
            NOTE(review): the data is aggregated to one value per
            turn/agent/category before plotting, so the interval may have no
            visible effect - confirm.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If neither ``student_col`` nor ``tutor_col`` is given.
    """
    if not (student_col or tutor_col):
        raise ValueError("You must provide at least one of student_col or tutor_col.")

    both_agents = bool(student_col and tutor_col)

    # Stack the selected columns into one long frame tagged with the agent.
    frames = []
    for column, agent in ((student_col, "Student"), (tutor_col, "Tutor")):
        if not column:
            continue
        frame = df[["turn", column]].copy()
        frame["source"] = agent
        frames.append(frame.rename(columns={column: "predicted_label"}))
    stacked = pd.concat(frames, ignore_index=True)

    label_order = sorted(stacked["predicted_label"].dropna().unique())
    stacked["predicted_label"] = pd.Categorical(
        stacked["predicted_label"], categories=label_order, ordered=True
    )

    per_group = stacked.groupby(["turn", "source", "predicted_label"], observed=True)
    tally = per_group.size().reset_index(name="count")

    if not use_percent:
        tally["value"] = tally["count"]
        y_label = "Number of Occurrences"
        # Leave a little headroom above the tallest point.
        y_max = tally["value"].max() + 3

        def tick_text(y, _):
            return f"{int(y)}"

    else:
        # Normalise within each (turn, agent) pair.
        pair_totals = tally.groupby(["turn", "source"], observed=True)[
            "count"
        ].transform("sum")
        tally["value"] = (tally["count"] / pair_totals) * 100
        y_label = "Occurrences (%)"
        y_max = 100

        def tick_text(y, _):
            return f"{y:.0f}%"

    sns.set_style("whitegrid")
    g = sns.relplot(
        data=tally,
        x="turn",
        y="value",
        hue="predicted_label",
        kind="line",
        col="source" if both_agents else None,
        facet_kws={"sharey": True, "sharex": True},
        height=4.5,
        aspect=1.5,
        marker="o",
        palette=palette,
        hue_order=label_order,
        errorbar=("ci", 95) if show_ci else None,
    )

    if both_agents:
        g.set_titles("{col_name} Messages")
    g.set_axis_labels("Turn", y_label)

    # Make room on the right and restyle the legend placed there.
    g.fig.subplots_adjust(right=0.85)
    legend = g._legend
    legend.set_bbox_to_anchor((1.12, 0.5))
    legend.set_frame_on(True)
    legend.set_title("Predicted Category")

    tick_formatter_args = (tick_text,)
    for axis in g.axes.flat:
        axis.set_ylim(0, y_max)
        axis.yaxis.set_major_formatter(mtick.FuncFormatter(*tick_formatter_args))

    plt.suptitle(title, fontsize=15, fontweight="bold", y=0.95)
    plt.tight_layout()
    plt.show()

Example usage:

plot_predicted_categories(
        df=annotated_df,
        student_col="predicted_labels_student_msg",
        tutor_col="predicted_labels_tutor_msg",
        title="Predicted Category Distribution"
    )

Parameters:

Name Type Description Default
df DataFrame Input DataFrame with turn-level predicted labels for student and/or tutor. required
student_col str or None Name of the column containing student-predicted categories. Optional. None
tutor_col str or None Name of the column containing tutor-predicted categories. Optional. None
use_percent bool Whether to plot percentage values (True) or raw counts (False). True
palette str Color palette used for the plot. Optional. "icefire"
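title str Title of the plot. Optional. "Predicted Category Frequency"
show_ci bool Whether to request a 95% confidence interval (errorbar=("ci", 95)) for the lines. Optional. False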

Returns:

Name Type Description
None Displays the plot using matplotlib.pyplot.show(). No object is returned.

4. Interaction Distribution

Plot the frequency of predicted categories in the previous turn of the opposite agent. Both the student and the tutor columns are required.

Source code in src/educhateval/descriptive_results/display_results.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def plot_previous_turn_distribution(
    df,
    student_col="predicted_labels_student_msg",
    tutor_col="predicted_labels_tutor_msg",
    focus_agent="student",
    use_percent=True,
    palette="icefire",
    title=None,
):
    """
    Plot the frequency of predicted categories in the previous turn of the
    *opposite* agent. Both the student and the tutor columns are required.

    For every category of the focus agent in the current turn, the bars show
    which categories the opposite agent had in the preceding row of the same
    ``student_id`` (rows ordered by ``turn``).

    Args:
        df: DataFrame with "student_id" and "turn" columns plus both
            predicted-label columns.
        student_col: Name of the column with student labels.
        tutor_col: Name of the column with tutor labels.
        focus_agent: "student" or "tutor"; the agent whose current-turn
            category is placed on the x-axis.
        use_percent: If True, bars show shares within each focus category
            (y-axis fixed to 0-100); otherwise raw counts.
        palette: Palette passed to ``sns.catplot``.
        title: Custom figure title. When None, the default
            "Frequency of Interactions: <Agent> Focus" title is kept.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If a column name is missing, ``focus_agent`` is invalid,
            or no row has both a focus label and a previous opposite label.
    """

    if not student_col or not tutor_col:
        raise ValueError("Both student_col and tutor_col must be provided.")

    if focus_agent not in ["student", "tutor"]:
        raise ValueError("focus_agent must be either 'student' or 'tutor'.")

    if focus_agent == "student":
        focus_col, opposite_col = student_col, tutor_col
        focus_label, opposite_label = "Student", "Tutor"
    else:
        focus_col, opposite_col = tutor_col, student_col
        focus_label, opposite_label = "Tutor", "Student"

    # Attach the opposite agent's label from the previous row of the same
    # student; the first row of each student has no predecessor and gets NaN.
    df_sorted = df.sort_values(by=["student_id", "turn"]).copy()
    df_sorted["prev_opposite_label"] = df_sorted.groupby("student_id")[
        opposite_col
    ].shift(1)
    df_filtered = df_sorted.dropna(subset=[focus_col, "prev_opposite_label"])

    if df_filtered.empty:
        # Without this guard the function fails later with an obscure error
        # (e.g. a division by zero when spacing the bar annotations).
        raise ValueError(
            "No rows with both a current label and a previous-turn label "
            "of the opposite agent; nothing to plot."
        )

    # Count combinations
    grouped = (
        df_filtered.groupby([focus_col, "prev_opposite_label"], observed=True)
        .size()
        .reset_index(name="count")
    )

    # Ensure all category combinations are represented (missing ones as 0)
    focus_vals = sorted(df_filtered[focus_col].dropna().unique())
    prev_vals = sorted(df_filtered["prev_opposite_label"].dropna().unique())
    full_grid = pd.MultiIndex.from_product(
        [focus_vals, prev_vals], names=[focus_col, "prev_opposite_label"]
    ).to_frame(index=False)
    grouped = full_grid.merge(
        grouped, on=[focus_col, "prev_opposite_label"], how="left"
    ).fillna(0)
    grouped["count"] = grouped["count"].astype(int)

    if use_percent:
        # Shares are computed after the grid is filled so that the added
        # zero-count combinations get 0% rather than NaN.
        grouped["percentage"] = (
            grouped.groupby(focus_col)["count"]
            .transform(lambda x: x / x.sum() * 100)
            .fillna(0)
        )
        y_col = "percentage"
        y_label = f"Category in Previous Turn for {opposite_label} (%)"
        fmt = lambda val: f"{val:.0f}%"
    else:
        y_col = "count"
        y_label = f"Category in Previous Turn for {opposite_label} (n)"
        fmt = lambda val: f"{int(val)}"

    grouped = grouped.sort_values(by=[focus_col, "prev_opposite_label"])

    # Plot
    sns.set_style("whitegrid")
    g = sns.catplot(
        data=grouped,
        x=focus_col,
        y=y_col,
        hue="prev_opposite_label",
        kind="bar",
        palette=palette,
        height=6,
        aspect=2.5,
        dodge=True,
        order=focus_vals,
        hue_order=prev_vals,
    )

    # Narrow each bar slightly, keeping it centred in its slot so it stays
    # aligned with the annotation positions computed below.
    for patch in g.ax.patches:
        old_width = patch.get_width()
        new_width = old_width * 0.9
        patch.set_x(patch.get_x() + (old_width - new_width) / 2)
        patch.set_width(new_width)

    # Labels and title
    g.set_axis_labels(f"Category in Current Turn for {focus_label}", y_label)
    g.fig.suptitle(
        f"Frequency of Interactions: {focus_label} Focus",
        fontsize=15,
        fontweight="bold",
        y=0.99,
    )

    if use_percent:
        g.ax.set_ylim(0, 100)
        g.ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda y, _: f"{y:.0f}%"))

    # Annotate values (including 0s). Positions assume seaborn's default
    # total group width of 0.8 split evenly across the hue levels.
    dodge_width = 0.8 / len(prev_vals)
    for _, row in grouped.iterrows():
        x_pos = focus_vals.index(row[focus_col])
        hue_idx = prev_vals.index(row["prev_opposite_label"])
        xpos_shifted = x_pos - 0.4 + dodge_width / 2 + hue_idx * dodge_width
        height = row[y_col]
        g.ax.annotate(
            fmt(height),
            xy=(xpos_shifted, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )

    g.fig.subplots_adjust(right=0.85)
    g._legend.set_bbox_to_anchor((1.12, 0.5))
    g._legend.set_frame_on(True)
    g._legend.set_title(f"{opposite_label} Category (Turn - 1)")

    # Only override the default title when the caller supplied one; calling
    # suptitle with None would replace the default title set above.
    if title is not None:
        plt.suptitle(title, fontsize=15, fontweight="bold", y=0.95)

    plt.tight_layout()
    plt.show()

Example usage:

viz.plot_history_interaction(
    df=annotated_df,
    student_col="predicted_labels_student_msg",
    tutor_col="predicted_labels_tutor_msg",
    focus_agent="tutor",
    use_percent=True
)

Parameters:

Name Type Description Default
df DataFrame Input DataFrame including turn-level predicted labels for both student and tutor. required
student_col str or None Column name containing student-predicted categories. required
tutor_col str or None Column name containing tutor-predicted categories. required
focus_agent str Determines whether to analyze the student or tutor perspective. Options: "student" or "tutor". "student"
use_percent bool If True, the y-axis will display percentages; otherwise raw counts are shown. True
palette str Color palette used for the plot. Optional. "icefire"
title str or None Custom title for the plot. Optional. None

Returns:

Name Type Description
None Displays the plot using matplotlib.pyplot.show(). No object is returned.