Source code for plom.create.classlistValidator

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2018-2024 Andrew Rechnitzer
# Copyright (C) 2019-2025 Colin B. Macdonald
# Copyright (C) 2020 Vala Vakilian
# Copyright (C) 2020 Dryden Wiebe
# Copyright (C) 2024 Aden Chan

from collections import defaultdict
import csv
from pathlib import Path
from typing import Any, Sequence

from plom.rules import validateStudentID

# Canonical classlist column headers.  Header matching elsewhere in this
# module compares casefolded text, so these are stored casefolded as well.
sid_field, fullname_field, papernumber_field = (
    header.casefold() for header in ("id", "name", "paper_number")
)

# Column titles whose joint presence marks a CSV as a Canvas export.
canvas_columns_format = (
    "Student",
    "ID",
    "SIS User ID",
    "SIS Login ID",
)


[docs] class PlomClasslistValidator: """The Plom Classlist Validator has methods to help ensure compatible classlists."""
[docs] def readClassList(self, filename: Path | str) -> list[dict[str, Any]]: """Read classlist from filename and return as list of dicts. Arguments: filename: csv-file to be loaded. It must be UTF-8-encoded, or "utf-8-sig". Returns: List of dictionaries (keys are column titles). We canonicalize the header names so that we have at least ``"id", "name", "paper_number"`` and ``"_src_line"``, the latter used for error messages in further validation. The ``"paper_number"`` key might or might not be present. Raises: ValueError: the file does not contain a header line, or the file does not contain any of the header names we might expect, or there is some other problem with the headers. """ classAsDicts = [] # Note newline: https://docs.python.org/3/library/csv.html#id4 # Note: utf-8-sig is a Microsoft thing, Issue #3200 which AFAICT # is harmless for us https://docs.python.org/3/library/codecs.html#encodings-and-unicode with open(filename, newline="", encoding="utf-8-sig") as csvfile: # Issue #3938: we previously did "Sniffer" stuff here before opening it, # not quite sure why, and it eventually caused trouble. reader = csv.DictReader(csvfile) # check it has a header (I think this can't fail, but keeps MyPy happy) if not reader.fieldnames: raise ValueError("The CSV file has no header") id_key, name_key, paper_number_key = self._checkHeaders(reader.fieldnames) # now actually read the entries for row in reader: row["_src_line"] = reader.line_num # canonicalize cases, replacing whatever case was there before row[sid_field] = row.pop(id_key) row[fullname_field] = row.pop(name_key) if paper_number_key is not None: row[papernumber_field] = row.pop(paper_number_key) classAsDicts.append(row) return classAsDicts
def _checkHeaders(self, headers: Sequence[str]) -> list[str | None]: """Check existence of id and name columns in the classlist. Checks the column titles (as given by the supplied row from the classlist). Tests for an id column, name-column, and the papernumber column. Names must be a single column. To avoid issues with upper and lower case, everything needs to be tested by casefolding. Arguments: headers: the list of keys of the column titles. Returns: A list of the key names, dict of the form ``[id_key, fullname_key, papernumber_key]``. If there is no ``"paper_number"`` column, then the ``paper_number_key`` will be `None`. Raises: ValueError: with a message about what column header problem we found. You might need to call multiple times to get all the problems: this fails fast on the first problem found. """ id_keys = [] fullname_keys = [] papernumber_keys: list[str | None] = [] for x in headers: cfx = x.casefold() if cfx == sid_field: id_keys.append(x) if cfx == fullname_field: fullname_keys.append(x) if cfx == papernumber_field: papernumber_keys.append(x) # Check for repeated column names, Issue #3667. if len(id_keys) > 1: raise ValueError( f'Column "id" is repeated multiple times in the ' f'CSV header: {", ".join(x for x in headers)}' ) if len(fullname_keys) > 1: # must have exactly one such column raise ValueError( f'Column "name" is repeated multiple times in the ' f'CSV header: {", ".join(x for x in headers)}' ) if len(papernumber_keys) > 1: raise ValueError( f'Column "paper_number" is repeated multiple times in the ' f'CSV header: {", ".join(x for x in headers)}' ) # Must have an id, name and paper_number columns if not id_keys: raise ValueError(f"Missing 'id' column in columns {headers}") if not fullname_keys: raise ValueError(f"Missing 'name' column in columns {headers}") if not papernumber_keys: # Issue #2273 # raise ValueError("Missing paper_number column") papernumber_keys = [None] # We explicitly allow casefolding (but could change our minds?) 
# See #3822 and #1140. # if id_keys != [sid_field]: # raise ValueError(f"'id' present but incorrect case; header: {headers}") # if fullname_keys != [fullname_field]: # raise ValueError(f"'name' present but incorrect case; header: {headers}") return [id_keys[0], fullname_keys[0], papernumber_keys[0]]
[docs] def check_ID_column(self, id_key, classList) -> tuple[bool, list]: """Check the ID column of the classlist.""" err = [] ids_used = defaultdict(list) for x in classList: # this is separate function - will be institution dependent. # will be better when we move to UIDs. idv = validateStudentID(x[id_key]) if idv[0] is False: err.append([x["_src_line"], idv[1]]) ids_used[x[id_key]].append(x["_src_line"]) for x, v in ids_used.items(): if len(v) > 1: if len(str(x)) == 0: # for #3091 - explicit error for blank ID err.append([v[0], f"Blank ID appears on multiple lines {v}"]) else: err.append( [v[0], f"ID '{x}' is used multiple times - on lines {v}"] ) if len(err) > 0: return (False, err) else: return (True, [])
[docs] @staticmethod def is_paper_number_sentinel(x: int | float | str | None) -> bool: """True if the input is None, blank, -1 or '-1'. Note: zero is not sentinel. """ return x in ("", None, "-1", -1)
[docs] def check_paper_number_column(self, papernum_key, classList) -> tuple[bool, list]: """Check the papernumber column of the classlist. Entries must either be blank, or integers >= -1. Note that: * no integer >=0 can be used twice, and * blank or -1 are sentinel values used to indicate 'do not prename' """ def is_an_int(x: int | float | str) -> bool: """True if input can be converted to an int.""" try: int(x) except ValueError: return False return True def is_nearly_a_non_negative_int(x: int | float | str) -> bool: """True input can be converted to a non-negative float which has integer value. eg - returns true for "1.0", but false for "0.9", "-2", "-2.1", "13.2" and so on. """ try: v = float(x) except ValueError: return False return (int(v) == v) and (v >= 0) err = [] numbers_used = defaultdict(list) for x in classList: pn = x.get(papernum_key, None) # see #3099 - we can reuse papernum = -1 since it is a sentinel value, so ignore any -1's if self.is_paper_number_sentinel(pn): continue # notice that this handles pn being None. if is_an_int(pn): if int(pn) < 0: err.append( [ x["_src_line"], f"Paper-number {x[papernum_key]} must be a non-negative integer, or blank or '-1' to indicate 'do not prename'", ] ) else: if is_nearly_a_non_negative_int(x[papernum_key]): err.append( [ x["_src_line"], f"Paper-number {x[papernum_key]} is nearly, but not quite, a non-negative integer", ] ) continue else: err.append( [ x["_src_line"], f"Paper-number {x[papernum_key]} is not a non-negative integer", ] ) continue # otherwise store the used papernumber. numbers_used[x[papernum_key]].append(x["_src_line"]) for x, v in numbers_used.items(): if len(v) > 1: err.append( [v[0], f"Paper-number '{x}' is used multiple times - on lines {v}"] ) if len(err) > 0: return (False, err) else: return (True, [])
[docs] def check_name_column(self, fullname_key, classList) -> list: """Check name column return any warnings.""" warn = [] for x in classList: # check non-trivial length after removing spaces and commas tmp = x[fullname_key].replace(" ", "").replace(",", "") # warn if name-field is very short if len(tmp) < 2: # TODO - decide a better bound here warn.append( [x["_src_line"], f"Name '{tmp}' is very short - please verify."] ) return warn
[docs] def check_classlist_against_spec(self, spec, classlist_length: int) -> list[str]: """Validate the classlist-length against spec parameters. Args: spec (None/dict/SpecVerifier): an optional test specification, if given then run additional classlist-related tests. classlist_length: the number of students in the classlist. Returns: If 'numberToProduce' is positive but less than classlist_length then returns [warning_message], else returns empty list. """ if spec is None: return [] elif spec["numberToProduce"] == -1: return [] elif spec["numberToProduce"] < classlist_length: return [ f"Classlist is long. Classlist contains {classlist_length} names, but spec:numberToProduce is {spec['numberToProduce']}" ] return []
[docs] def validate_csv( self, filename: Path | str, *, spec=None ) -> tuple[bool, list[dict[str, Any]], list[dict[str, Any]]]: """Validate the classlist csv and return summaries of any errors and warnings. Args: filename: a csv file from which to try to load the classlist. It must be UTF-8-encoded. Microsoft's "utf-8-sig" with "FEFF" byte-order-mark is also reluctantly accepted. Keyword Args: spec (None/dict/SpecVerifier): an optional test specification, if given then run additional classlist-related tests. Returns: ``(valid, warnings_and_errors, cl_as_list_of_dicts)`` where "valid" is True/False, "warnings_and_errors" is a list of dicts and "cl_as_dicts" is a list of dicts of the actual classlist, with canonicalized fieldnames, at least "id", "name", "paper_number". In the 2nd output, each dict encodes a single warning or an error: see code for precise format. It is possible for "valid" to be True and still have non-empty "warnings_and_errors" for example when there are only warnings. """ werr = [] try: cl_as_dicts = self.readClassList(filename) except (ValueError, FileNotFoundError) as err: werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": f"{err}"}) return (False, werr, []) except Exception as err: e = f"Some other sort of error reading {filename}: {type(err)} {err}" werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": e}) return (False, werr, []) if len(cl_as_dicts) == 0: # Headers were OK, followed by no data. That's degenerate, but valid. e = "CSV file seems to be empty (headers only)" werr.append({"warn_or_err": "warn", "werr_line": 0, "werr_text": e}) # collect all errors and warnings before bailing out. 
validity = True # check the ID column - again, potentially errors here (not just warnings) success, errors = self.check_ID_column(sid_field, cl_as_dicts) if not success: # format errors and set invalid validity = False for e in errors: werr.append( {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]} ) # check the paperNumber column - again, potentially errors here (not just warnings) success, errors = self.check_paper_number_column(papernumber_field, cl_as_dicts) if not success: # format errors and set invalid validity = False for e in errors: werr.append( {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]} ) # check against spec - only warnings returned for w in self.check_classlist_against_spec(spec, len(cl_as_dicts)): werr.append({"warn_or_err": "warning", "werr_line": 0, "werr_text": w}) # check the name column - only warnings returned for w in self.check_name_column(fullname_field, cl_as_dicts): werr.append( {"warn_or_err": "warning", "werr_line": w[0], "werr_text": w[1]} ) return (validity, werr, cl_as_dicts)
[docs] def check_is_canvas_csv(self, csv_file_name: Path | str) -> bool: """Detect if a csv file is likely a Canvas-exported classlist. Arguments: csv_file_name: csv file to be checked. Returns: True if we think the input was from Canvas, based on presence of certain header names. Otherwise False. """ # Note newline: https://docs.python.org/3/library/csv.html#id4 with open(csv_file_name, newline="") as f: csv_reader = csv.DictReader(f, skipinitialspace=True) csv_fields = csv_reader.fieldnames if csv_fields is None: csv_fields = [] return all(x in csv_fields for x in canvas_columns_format)
[docs] def check_is_non_canvas_csv(self, csv_file_name: Path | str) -> bool: """Read the csv file and check if id and name columns exist. 1. Check if id is present or any of possible_sid_fields. 2. Check if name is preset or any of possible_fullname_fields. Arguments: csv_file_name: the csv file. Returns: bool """ print(f'Loading from non-Canvas csv file to check file: "{csv_file_name}"') # Note newline: https://docs.python.org/3/library/csv.html#id4 with open(csv_file_name, newline="") as f: csv_reader = csv.DictReader(f, skipinitialspace=True) column_names = csv_reader.fieldnames if column_names is None: column_names = [] # strip excess whitespace from column names to avoid issues with blanks column_names = [str(x).strip() for x in column_names] id_cols = [] fullname_cols = [] papernumber_cols = [] for x in column_names: cfx = x.casefold() print(">>>> checking ", cfx) if cfx == sid_field: id_cols.append(x) if cfx == fullname_field: fullname_cols.append(x) if cfx == papernumber_field: papernumber_cols.append(x) if not id_cols: print(f"Cannot find an id column - {id_cols}") print(f"Columns present = {column_names}") return False elif len(id_cols) > 1: print(f"Multiple id columns - {id_cols}") print(f"Columns present = {column_names}") return False if not fullname_cols: print(f"Cannot find an name column - {fullname_cols}") print(f"Columns present = {column_names}") return False elif len(fullname_cols) > 1: print("Multiple name columns - {fullname_cols}") print(f"Columns present = {column_names}") return False if not papernumber_cols: # Issue #2273 # print(f"Cannot find a paper number column - {papernumber_cols}") # print(f"Columns present = {column_names}") # return False pass elif len(papernumber_cols) > 1: print("Multiple paper number columns - {papernumber_cols}") print(f"Columns present = {column_names}") return False return True
@classmethod def print_classlist_warnings_errors(cls, warn_err: list[dict[str, Any]]) -> None: # separate into warn and err warn = [X for X in warn_err if X["warn_or_err"] == "warning"] err = [X for X in warn_err if X["warn_or_err"] != "warning"] # sort by line number warn.sort(key=lambda X: X["werr_line"]) err.sort(key=lambda X: X["werr_line"]) if warn: print("Classlist validation warnings:") for X in warn: print(f"\tline {X['werr_line']}: {X['werr_text']}") if err: print("Classlist validation errors:") for X in err: print(f"\tline {X['werr_line']}: {X['werr_text']}")