Source code for plom.create.classlistValidator

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2018-2024 Andrew Rechnitzer
# Copyright (C) 2019-2025 Colin B. Macdonald
# Copyright (C) 2020 Vala Vakilian
# Copyright (C) 2020 Dryden Wiebe
# Copyright (C) 2024 Aden Chan

from collections import defaultdict
import csv
from pathlib import Path
from typing import Any, Sequence

from plom.rules import validateStudentID

canvas_columns_format = ("Student", "ID", "SIS User ID", "SIS Login ID")



[docs]
class PlomClasslistValidator:
    """The Plom Classlist Validator has methods to help ensure compatible classlists."""


[docs]
    def readClasslist(self, filename: Path | str) -> list[dict[str, Any]]:
        """Read classlist from filename and return as list of dicts.

        Arguments:
            filename: csv-file to be loaded.  It must be UTF-8 encoded, or
                "utf-8-sig".

        Returns:
            List of dictionaries (keys are column titles).  We canonicalize the
            header names so that we have at least ``"id", "name", "paper_number"``
            and ``"_src_line"``, the latter used for error messages in further
            validation.  The ``"paper_number"`` key might or might not be present.

        Raises:
            ValueError: the file does not contain a header line, or the file
                does not contain any of the header names we might expect, or
                there is some other problem with the headers.
            UnicodeDecodeError: encoding troubles, such as when the file isn't
                in UTF-8.
        """
        classAsDicts = []
        # Note newline: https://docs.python.org/3/library/csv.html#id4
        # Note: utf-8-sig is a Microsoft thing, Issue #3200 which AFAICT
        # is harmless for us https://docs.python.org/3/library/codecs.html#encodings-and-unicode
        with open(filename, newline="", encoding="utf-8-sig") as csvfile:
            # Issue #3938: we previously did "Sniffer" stuff here before opening it,
            # not quite sure why, and it eventually caused trouble.
            reader = csv.DictReader(csvfile)

            # check it has a header (I think this can't fail, but keeps MyPy happy)
            if not reader.fieldnames:
                raise ValueError("The CSV file has no header")

            id_key, name_key, paper_number_key = self._checkHeaders(reader.fieldnames)

            # now actually read the entries
            for row in reader:
                row["_src_line"] = reader.line_num
                # canonicalize cases, replacing whatever case was there before
                row["id"] = row.pop(id_key)
                row["name"] = row.pop(name_key)
                if paper_number_key is not None:
                    row["paper_number"] = row.pop(paper_number_key)
                classAsDicts.append(row)
            return classAsDicts


    def _checkHeaders(self, headers: Sequence[str]) -> list[str | None]:
        """Check existence of id and name columns in the classlist.

        Checks the column titles (as given by the supplied row from
        the classlist).  Tests for an id column, name-column, and the
        papernumber column. Names must be a single column. To avoid
        issues with upper and lower case, everything needs to be tested
        by casefolding.

        Arguments:
            headers: the list of keys of the column titles.

        Returns:
            A list of the key names, dict of the form
            ``[id_key, fullname_key, papernumber_key]``.
            If there is no ``"paper_number"`` column, then the
            ``paper_number_key`` will be `None`.

        Raises:
            ValueError: with a message about what column header problem we found.
                You might need to call multiple times to get all the problems:
                this fails fast on the first problem found.
        """
        id_keys = []
        fullname_keys = []
        papernumber_keys: list[str | None] = []
        for x in headers:
            cfx = x.casefold()
            if cfx == "id":
                id_keys.append(x)
            if cfx == "name":
                fullname_keys.append(x)
            if cfx == "paper_number":
                papernumber_keys.append(x)

        # Check for repeated column names, Issue #3667.
        if len(id_keys) > 1:
            raise ValueError(
                f'Column "id" is repeated multiple times in the '
                f'CSV header: {", ".join(x for x in headers)}'
            )
        if len(fullname_keys) > 1:  # must have exactly one such column
            raise ValueError(
                f'Column "name" is repeated multiple times in the '
                f'CSV header: {", ".join(x for x in headers)}'
            )
        if len(papernumber_keys) > 1:
            raise ValueError(
                f'Column "paper_number" is repeated multiple times in the '
                f'CSV header: {", ".join(x for x in headers)}'
            )
        # Must have an id, name and paper_number columns
        if not id_keys:
            raise ValueError(f"Missing 'id' column in columns {headers}")
        if not fullname_keys:
            raise ValueError(f"Missing 'name' column in columns {headers}")
        if not papernumber_keys:
            # Issue #2273
            # raise ValueError("Missing paper_number column")
            papernumber_keys = [None]

        # We explicitly allow casefolding (but could change our minds?)
        # See #3822 and #1140.
        # if id_keys != ["id"]:
        #     raise ValueError(f"'id' present but incorrect case; header: {headers}")
        # if fullname_keys != ["name"]:
        #     raise ValueError(f"'name' present but incorrect case; header: {headers}")

        return [id_keys[0], fullname_keys[0], papernumber_keys[0]]


[docs]
    def check_ID_column(
        self, classlist: list[dict[str, str | int]]
    ) -> tuple[bool, list]:
        """Check the ID column of the classlist."""
        err = []
        ids_used = defaultdict(list)
        for idx, row in enumerate(classlist):
            # this is separate function - will be institution dependent.
            # will be better when we move to UIDs.
            idv = validateStudentID(row["id"])

            where = row.get("_src_line", None)
            if where is None:
                # don't have _src_line, maybe not from csv file, use 1-index
                where = idx + 1

            if idv[0] is False:
                err.append([where, idv[1]])
            ids_used[row["id"]].append(where)
        for x, v in ids_used.items():
            if len(v) > 1:
                if len(str(x)) == 0:  # for #3091 - explicit error for blank ID
                    err.append([v[0], f"Blank ID appears on multiple lines {v}"])
                else:
                    err.append(
                        [v[0], f"ID '{x}' is used multiple times - on lines {v}"]
                    )
        if len(err) > 0:
            return (False, err)
        else:
            return (True, [])



[docs]
    @staticmethod
    def is_paper_number_sentinel(x: int | float | str | None) -> bool:
        """True if the input is None, blank, -1 or '-1'.

        Note: zero is not sentinel.
        """
        return x in ("", None, "-1", -1)



[docs]
    def check_paper_number_column(
        self, classlist: list[dict[str, str | int]]
    ) -> tuple[bool, list]:
        """Check the papernumber column of the classlist.

        Entries must either be blank, or integers >= -1.
        Note that:
            * no integer >=0 can be used twice, and
            * blank or -1 are sentinel values used to indicate 'do not prename'
        """

        def is_an_int(x: int | float | str) -> bool:
            """True if input can be converted to an int."""
            try:
                int(x)
            except (ValueError, TypeError):
                return False
            return True

        def is_nearly_a_non_negative_int(x: int | float | str) -> bool:
            """True input can be converted to a non-negative float which has integer value.

            eg - returns true for "1.0", but false for "0.9", "-2", "-2.1", "13.2" and so on.
            """
            try:
                v = float(x)
            except (ValueError, TypeError):
                return False
            return (int(v) == v) and (v >= 0)

        err = []
        numbers_used = defaultdict(list)
        for idx, row in enumerate(classlist):
            pn = row.get("paper_number", None)
            # see #3099 - we can reuse papernum = -1 since it is a sentinel value, so ignore any -1's
            if self.is_paper_number_sentinel(pn):
                continue  # notice that this handles pn being None.
            assert pn is not None

            where = row.get("_src_line", None)
            if where is None:
                # don't have _src_line, maybe not from csv file, use 1-index
                where = idx + 1

            if is_an_int(pn):
                if int(pn) < 0:
                    err.append(
                        [
                            where,
                            f"Paper-number {pn} must be a non-negative integer, "
                            "or blank or '-1' to indicate 'do not prename'",
                        ]
                    )
            else:
                if is_nearly_a_non_negative_int(pn):
                    err.append(
                        [
                            where,
                            f"Paper-number {pn} is nearly, but not quite, a non-negative integer",
                        ]
                    )
                    continue
                else:
                    err.append(
                        [
                            where,
                            f"Paper-number {pn} is not a non-negative integer",
                        ]
                    )
                    continue

            # otherwise store the used papernumber.
            numbers_used[pn].append(where)
        for x, v in numbers_used.items():
            if len(v) > 1:
                err.append(
                    [v[0], f"Paper-number '{x}' is used multiple times - on lines {v}"]
                )
        if len(err) > 0:
            return (False, err)
        else:
            return (True, [])



[docs]
    def check_name_column(self, classlist: list[dict[str, str | int]]) -> list:
        """Check name column return any warnings."""
        warn = []
        for idx, x in enumerate(classlist):
            where = x.get("_src_line", None)
            if where is None:
                # don't have _src_line, maybe not from csv file, use 1-index
                where = idx + 1

            tmp = x["name"]
            # check non-trivial length after removing spaces and commas
            if not isinstance(tmp, str):
                warn.append([where, f'Name should be str, but "{tmp}" is {type(tmp)}'])
                continue
            tmp = tmp.replace(" ", "").replace(",", "")
            # warn if name-field is very short
            if len(tmp) < 2:  # TODO - decide a better bound here
                warn.append([where, f"Name '{tmp}' is very short  - please verify."])
        return warn



[docs]
    def validate_csv(
        self, filename: Path | str
    ) -> tuple[bool, list[dict[str, Any]], list[dict[str, Any]]]:
        """Validate the classlist csv and return summaries of any errors and warnings.

        Args:
            filename: a csv file from which to try to load the classlist.
                It must be UTF-8-encoded.  Microsoft's "utf-8-sig" with
                "FEFF" byte-order-mark is also reluctantly accepted.

        Returns:
            ``(valid, warnings_and_errors, cl_as_list_of_dicts)`` where
            "valid" is True/False, "warnings_and_errors" is a list of dicts
            and "cl_as_dicts" is a list of dicts of the actual classlist,
            with canonicalized fieldnames, at least "id", "name", "paper_number".
            In the 2nd output, each dict encodes a single warning or an error:
            see code for precise format.  It is possible for "valid" to be True
            and still have non-empty "warnings_and_errors" for example
            when there are only warnings.
        """
        werr = []
        try:
            cl_as_dicts = self.readClasslist(filename)
        except UnicodeDecodeError as err:
            errstr = (
                "This file has the wrong encoding: "
                "you may need to export it as a UTF-8 file.  "
                f"{err.__class__.__name__}: {err}"
            )
            # TODO: we could extend the werr structure to include "more info"
            # url = "https://plom.readthedocs.io/en/latest/faq.html#how-do-i-make-a-utf-8-csv-file"
            werr.append(
                {
                    "warn_or_err": "error",
                    "werr_line": 0,
                    "werr_text": errstr,
                    # "werr_more_info_url": url,
                }
            )
            return (False, werr, [])
        except (ValueError, FileNotFoundError) as err:
            errstr = f"{err.__class__.__name__}: {err}"
            werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": errstr})
            return (False, werr, [])
        except Exception as err:
            e = f"Some other sort of error reading {filename}: {type(err)} {err}"
            werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": e})
            return (False, werr, [])

        if len(cl_as_dicts) == 0:
            # Headers were OK, followed by no data. That's degenerate, but valid.
            e = "CSV file seems to be empty (headers only)"
            werr.append({"warn_or_err": "warn", "werr_line": 0, "werr_text": e})

        valid, _werr2 = self.validate(cl_as_dicts)
        werr.extend(_werr2)
        return (valid, werr, cl_as_dicts)



[docs]
    def validate(
        self, cl_as_dicts: list[dict[str, Any]]
    ) -> tuple[bool, list[dict[str, Any]]]:
        """Validate a proposed classlist and return summaries of any errors and warnings.

        Args:
            cl_as_dicts: a list of dicts with fields "id", "name",
                and optionally "paper_number".

        Returns:
            ``(valid, warnings_and_errors)`` as described in :method:`validate_csv`.
        """
        werr = []
        # collect all errors and warnings before bailing out.
        validity = True

        for row_idx, row in enumerate(cl_as_dicts):
            for key in ("id", "name"):
                if key not in row.keys():
                    validity = False
                    werr.append(
                        {
                            "warn_or_err": "error",
                            "werr_line": row_idx,
                            "werr_text": f'Missing "{key}" column',
                        }
                    )
                if not validity:
                    # bail early as later tests rely on key names
                    return (validity, werr)

        # check the ID column - again, potentially errors here (not just warnings)
        success, errors = self.check_ID_column(cl_as_dicts)
        if not success:  # format errors and set invalid
            validity = False
            for e in errors:
                werr.append(
                    {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]}
                )

        # check the paperNumber column - again, potentially errors here (not just warnings)
        success, errors = self.check_paper_number_column(cl_as_dicts)
        if not success:  # format errors and set invalid
            validity = False
            for e in errors:
                werr.append(
                    {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]}
                )

        # check the name column - only warnings returned
        for w in self.check_name_column(cl_as_dicts):
            werr.append(
                {"warn_or_err": "warning", "werr_line": w[0], "werr_text": w[1]}
            )

        return (validity, werr)



[docs]
    def check_is_canvas_csv(self, csv_file_name: Path | str) -> bool:
        """Detect if a csv file is likely a Canvas-exported classlist.

        Arguments:
            csv_file_name: csv file to be checked.

        Returns:
            True if we think the input was from Canvas, based on
            presence of certain header names.  Otherwise False.
        """
        # Note newline: https://docs.python.org/3/library/csv.html#id4
        with open(csv_file_name, newline="") as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            csv_fields = csv_reader.fieldnames
            if csv_fields is None:
                csv_fields = []
        return all(x in csv_fields for x in canvas_columns_format)



[docs]
    def check_is_non_canvas_csv(self, csv_file_name: Path | str) -> bool:
        """Read the csv file and check if id and name columns exist.

        1. Check if id is present.
        2. Check if name is preset.

        Arguments:
            csv_file_name: the csv file.

        Returns:
            bool
        """
        print(f'Loading from non-Canvas csv file to check file: "{csv_file_name}"')
        # Note newline: https://docs.python.org/3/library/csv.html#id4
        with open(csv_file_name, newline="") as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            column_names = csv_reader.fieldnames
            if column_names is None:
                column_names = []
        # strip excess whitespace from column names to avoid issues with blanks
        column_names = [str(x).strip() for x in column_names]

        id_cols = []
        fullname_cols = []
        papernumber_cols = []
        for x in column_names:
            cfx = x.casefold()
            print(">>>> checking ", cfx)
            if cfx == "id":
                id_cols.append(x)
            if cfx == "name":
                fullname_cols.append(x)
            if cfx == "paper_number":
                papernumber_cols.append(x)

        if not id_cols:
            print(f"Cannot find an id column - {id_cols}")
            print(f"Columns present = {column_names}")
            return False
        elif len(id_cols) > 1:
            print(f"Multiple id columns - {id_cols}")
            print(f"Columns present = {column_names}")
            return False

        if not fullname_cols:
            print(f"Cannot find an name column - {fullname_cols}")
            print(f"Columns present = {column_names}")
            return False
        elif len(fullname_cols) > 1:
            print(f"Multiple name columns - {fullname_cols}")
            print(f"Columns present = {column_names}")
            return False

        if not papernumber_cols:
            # Issue #2273
            # print(f"Cannot find a paper number column - {papernumber_cols}")
            # print(f"Columns present = {column_names}")
            # return False
            pass
        elif len(papernumber_cols) > 1:
            print("Multiple paper number columns - {papernumber_cols}")
            print(f"Columns present = {column_names}")
            return False

        return True


    @classmethod
    def print_classlist_warnings_errors(cls, warn_err: list[dict[str, Any]]) -> None:
        # separate into warn and err
        warn = [X for X in warn_err if X["warn_or_err"] == "warning"]
        err = [X for X in warn_err if X["warn_or_err"] != "warning"]
        # sort by line number
        warn.sort(key=lambda X: X["werr_line"])
        err.sort(key=lambda X: X["werr_line"])

        if warn:
            print("Classlist validation warnings:")
            for X in warn:
                print(f"\tline {X['werr_line']}: {X['werr_text']}")
        if err:
            print("Classlist validation errors:")
            for X in err:
                print(f"\tline {X['werr_line']}: {X['werr_text']}")