Source code for plom.create.classlistValidator

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2018-2024 Andrew Rechnitzer
# Copyright (C) 2019-2024 Colin B. Macdonald
# Copyright (C) 2020 Vala Vakilian
# Copyright (C) 2020 Dryden Wiebe
# Copyright (C) 2024 Aden Chan

from __future__ import annotations

from collections import defaultdict
import csv
from pathlib import Path
from typing import Any

from plom.rules import validateStudentID

# important classlist headers - all casefolded
sid_field = "id".casefold()
fullname_field = "name".casefold()
papernumber_field = "paper_number".casefold()

canvas_columns_format = ("Student", "ID", "SIS User ID", "SIS Login ID")
# combine all of these potential column headers into one casefolded list
potential_column_names = [
    sid_field,
    fullname_field,
    papernumber_field,
] + [x.casefold() for x in canvas_columns_format]



[docs]
class PlomClasslistValidator:
    """The Plom Classlist Validator has methods to help ensure compatible classlists."""


[docs]
    def readClassList(self, filename: Path | str) -> list[dict[str, Any]]:
        """Read classlist from filename and return as list of dicts.

        Arguments:
            filename: csv-file to be loaded.

        Returns:
            List of dictionaries (keys are column titles).

        Raises:
            ValueError: the file does not contain a header line, or if the file
                does not contain any of the header names we might expect.
        """
        classAsDict = []
        with open(filename) as csvfile:
            # look at start of file to guess 'dialect', and then return to start of file
            sample = csvfile.read(1024)
            csvfile.seek(0)
            # guess the dialect
            dialect = csv.Sniffer().sniff(sample)
            # build the dict_reader
            reader = csv.DictReader(csvfile, dialect=dialect)

            # check it has a header - csv.sniffer.has_header is a bit flakey
            # instead check that we have some of the potential keys - careful of case
            if not reader.fieldnames:
                raise ValueError("The CSV file has no header")
            column_names = [x.casefold() for x in reader.fieldnames]
            if any(x in potential_column_names for x in column_names):
                pass
                # print("Appears to have reasonable header - continuing.")
            else:
                raise ValueError("The CSV header has no fields that Plom recognises")

            # Check for repeated column names, Issue #3667.
            # (cannot be in checkHeaders because DictReader picks one silently)
            for x in potential_column_names:
                if sum([x == y for y in column_names]) > 1:
                    raise ValueError(
                        f'Column "{x}" is repeated multiple times in the '
                        f'CSV header: {", ".join(x for x in reader.fieldnames)}'
                    )

            # now actually read the entries
            for row in reader:
                row["_src_line"] = reader.line_num
                classAsDict.append(row)
            # return the list
            return classAsDict



[docs]
    def checkHeaders(self, rowFromDict: dict[str, Any]) -> dict[str, Any]:
        """Check existence of id and name columns in the classlist.

        Checks the column titles (as given by the supplied row from
        the classlist).  Tests for an id column, name-column, and the
        papernumber column. Names must be a single column. To avoid
        issues with upper and lower case, everything needs to be tested
        by casefolding.

        Arguments:
            rowFromDict: a row from the classlist encoded as a dictionary.
                The keys give the column titles.

        Returns:
            dict: If errors then return ``{'success': False, 'errors': error-list}``,
            else return ``{'success': True, 'id': id_key, 'fullname': fullname_key, 'papernumber': papernumber_key}``.
            If there is no ``"paper_number"`` column, then the
            ``paper_number_key`` will be `None`.
        """
        id_keys = []
        fullname_keys = []
        papernumber_keys: list[str | None] = []
        headers = list(rowFromDict.keys())
        for x in headers:
            cfx = x.casefold()
            if cfx == sid_field:
                id_keys.append(x)
            if cfx == fullname_field:
                fullname_keys.append(x)
            if cfx == papernumber_field:
                papernumber_keys.append(x)

        err = []
        # Must have at most one of each column
        if len(id_keys) > 1:
            err.append("Cannot have multiple id columns")
        if len(fullname_keys) > 1:  # must have exactly one such column
            err.append("Cannot have multiple name columns")
        if len(papernumber_keys) > 1:
            err.append("Cannot have multiple paper number columns")
        # Must have an id, name and paper_number columns
        if not id_keys:
            err.append(f"Missing 'id' column in columns {headers}")
        if not fullname_keys:
            err.append(f"Missing 'name' column in columns {headers}")
        if not papernumber_keys:
            # Issue #2273
            # err.append("Missing paper number column")
            papernumber_keys = [None]

        if err:
            return {"success": False, "errors": err}
        return {
            "success": True,
            "id": id_keys[0],
            "name": fullname_keys[0],
            "papernumber": papernumber_keys[0],
        }



[docs]
    def check_ID_column(self, id_key, classList) -> tuple[bool, list]:
        """Check the ID column of the classlist."""
        err = []
        ids_used = defaultdict(list)
        for x in classList:
            # this is separate function - will be institution dependent.
            # will be better when we move to UIDs.
            idv = validateStudentID(x[id_key])
            if idv[0] is False:
                err.append([x["_src_line"], idv[1]])
            ids_used[x[id_key]].append(x["_src_line"])
        for x, v in ids_used.items():
            if len(v) > 1:
                if len(str(x)) == 0:  # for #3091 - explicit error for blank ID
                    err.append([v[0], f"Blank ID appears on multiple lines {v}"])
                else:
                    err.append(
                        [v[0], f"ID '{x}' is used multiple times - on lines {v}"]
                    )
        if len(err) > 0:
            return (False, err)
        else:
            return (True, [])



[docs]
    @staticmethod
    def is_paper_number_sentinel(x: int | float | str | None) -> bool:
        """True if the input is None, blank, -1 or '-1'.

        Note: zero is not sentinel.
        """
        return x in ("", None, "-1", -1)



[docs]
    def check_papernumber_column(self, papernum_key, classList) -> tuple[bool, list]:
        """Check the papernumber column of the classlist.

        Entries must either be blank, or integers >= -1.
        Note that:
            * no integer >=0 can be used twice, and
            * blank or -1 are sentinel values used to indicate 'do not prename'
        """

        def is_an_int(x: int | float | str) -> bool:
            """True if input can be converted to an int."""
            try:
                int(x)
            except ValueError:
                return False
            return True

        def is_nearly_a_non_negative_int(x: int | float | str) -> bool:
            """True input can be converted to a non-negative float which has integer value.

            eg - returns true for "1.0", but false for "0.9", "-2", "-2.1", "13.2" and so on.
            """
            try:
                v = float(x)
            except ValueError:
                return False
            return (int(v) == v) and (v >= 0)

        err = []
        numbers_used = defaultdict(list)
        for x in classList:
            pn = x[papernum_key]
            # see #3099 - we can reuse papernum = -1 since it is a sentinel value, so ignore any -1's
            if self.is_paper_number_sentinel(pn):
                continue  # notice that this handles pn being None.
            if is_an_int(pn):
                if int(pn) < 0:
                    err.append(
                        [
                            x["_src_line"],
                            f"Paper-number {x[papernum_key]} must be a non-negative integer, or blank or '-1' to indicate 'do not prename'",
                        ]
                    )
            else:
                if is_nearly_a_non_negative_int(x[papernum_key]):
                    err.append(
                        [
                            x["_src_line"],
                            f"Paper-number {x[papernum_key]} is nearly, but not quite, a non-negative integer",
                        ]
                    )
                    continue
                else:
                    err.append(
                        [
                            x["_src_line"],
                            f"Paper-number {x[papernum_key]} is not a non-negative integer",
                        ]
                    )
                    continue

            # otherwise store the used papernumber.
            numbers_used[x[papernum_key]].append(x["_src_line"])
        for x, v in numbers_used.items():
            if len(v) > 1:
                err.append(
                    [v[0], f"Paper-number '{x}' is used multiple times - on lines {v}"]
                )
        if len(err) > 0:
            return (False, err)
        else:
            return (True, [])



[docs]
    def check_name_column(self, fullname_key, classList) -> list:
        """Check name column return any warnings."""
        warn = []
        for x in classList:
            # check non-trivial length after removing spaces and commas
            tmp = x[fullname_key].replace(" ", "").replace(",", "")
            # warn if name-field is very short
            if len(tmp) < 2:  # TODO - decide a better bound here
                warn.append(
                    [x["_src_line"], f"Name '{tmp}' is very short  - please verify."]
                )
        return warn



[docs]
    def check_classlist_against_spec(self, spec, classlist_length: int) -> list[str]:
        """Validate the classlist-length against spec parameters.

        Args:
            spec (None/dict/SpecVerifier): an optional test specification,
                if given then run additional classlist-related tests.
            classlist_length: the number of students in the classlist.

        Returns:
            If 'numberToProduce' is positive but less than classlist_length
            then returns [warning_message], else returns empty list.
        """
        if spec is None:
            return []
        elif spec["numberToProduce"] == -1:
            return []
        elif spec["numberToProduce"] < classlist_length:
            return [
                f"Classlist is long. Classlist contains {classlist_length} names, but spec:numberToProduce is {spec['numberToProduce']}"
            ]
        return []



[docs]
    def validate_csv(
        self, filename: Path | str, *, spec=None
    ) -> tuple[bool, list[dict[str, Any]]]:
        """Validate the classlist csv and return summaries of any errors and warnings.

        Args:
            filename: a csv file from which to try to load the classlist.

        Keyword Args:
            spec (None/dict/SpecVerifier): an optional test specification,
                 if given then run additional classlist-related tests.

        Returns:
            ``(valid, warnings_and_errors)`` where "valid" is either
            True or False and "warnings_and_errors" is a list of
            dicts.  Each dict encodes a single warning or an error: see
            code for precise format.  It is possible for "valid" to be True
            and still have non-empty "warnings_and_errors" for example
            when there are only warnings.
        """
        werr = []
        try:
            cl_as_dicts = self.readClassList(filename)
        except (ValueError, FileNotFoundError) as err:
            werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": f"{err}"})
            return (False, werr)
        except Exception as err:
            e = f"Some other sort of error reading {filename}: {err}"
            werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": e})
            return (False, werr)

        # check the headers - potentially un-recoverable errors here
        cl_header_info = self.checkHeaders(cl_as_dicts[0])
        if cl_header_info["success"] is False:  # format errors and bail-out
            for e in cl_header_info["errors"]:
                werr.append({"warn_or_err": "error", "werr_line": 0, "werr_text": e})
            return (False, werr)

        # collect all errors and warnings before bailing out.
        validity = True
        # check the ID column - again, potentially errors here (not just warnings)
        success, errors = self.check_ID_column(cl_header_info["id"], cl_as_dicts)
        if not success:  # format errors and set invalid
            validity = False
            for e in errors:
                werr.append(
                    {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]}
                )

        # check the paperNumber column - again, potentially errors here (not just warnings)
        if cl_header_info["papernumber"] is not None:
            success, errors = self.check_papernumber_column(
                cl_header_info["papernumber"], cl_as_dicts
            )
            if not success:  # format errors and set invalid
                validity = False
                for e in errors:
                    werr.append(
                        {"warn_or_err": "error", "werr_line": e[0], "werr_text": e[1]}
                    )

        # check against spec - only warnings returned
        for w in self.check_classlist_against_spec(spec, len(cl_as_dicts)):
            werr.append({"warn_or_err": "warning", "werr_line": 0, "werr_text": w})
        # check the name column - only warnings returned
        for w in self.check_name_column(cl_header_info["name"], cl_as_dicts):
            werr.append(
                {"warn_or_err": "warning", "werr_line": w[0], "werr_text": w[1]}
            )

        return (validity, werr)



[docs]
    def check_is_canvas_csv(self, csv_file_name: Path | str) -> bool:
        """Detect if a csv file is likely a Canvas-exported classlist.

        Arguments:
            csv_file_name: csv file to be checked.

        Returns:
            True if we think the input was from Canvas, based on
            presence of certain header names.  Otherwise False.
        """
        with open(csv_file_name) as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            csv_fields = csv_reader.fieldnames
            if csv_fields is None:
                csv_fields = []
        return all(x in csv_fields for x in canvas_columns_format)



[docs]
    def check_is_non_canvas_csv(self, csv_file_name: Path | str) -> bool:
        """Read the csv file and check if id and name columns exist.

        1. Check if id is present or any of possible_sid_fields.
        2. Check if name is preset or any of possible_fullname_fields.

        Arguments:
            csv_file_name: the csv file.

        Returns:
            bool
        """
        print(f'Loading from non-Canvas csv file to check file: "{csv_file_name}"')
        with open(csv_file_name) as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            column_names = csv_reader.fieldnames
            if column_names is None:
                column_names = []
        # strip excess whitespace from column names to avoid issues with blanks
        column_names = [str(x).strip() for x in column_names]

        id_cols = []
        fullname_cols = []
        papernumber_cols = []
        for x in column_names:
            cfx = x.casefold()
            print(">>>> checking ", cfx)
            if cfx == sid_field:
                id_cols.append(x)
            if cfx == fullname_field:
                fullname_cols.append(x)
            if cfx == papernumber_field:
                papernumber_cols.append(x)

        if not id_cols:
            print(f"Cannot find an id column - {id_cols}")
            print(f"Columns present = {column_names}")
            return False
        elif len(id_cols) > 1:
            print(f"Multiple id columns - {id_cols}")
            print(f"Columns present = {column_names}")
            return False

        if not fullname_cols:
            print(f"Cannot find an name column - {fullname_cols}")
            print(f"Columns present = {column_names}")
            return False
        elif len(fullname_cols) > 1:
            print("Multiple name columns - {fullname_cols}")
            print(f"Columns present = {column_names}")
            return False

        if not papernumber_cols:
            # Issue #2273
            # print(f"Cannot find a paper number column - {papernumber_cols}")
            # print(f"Columns present = {column_names}")
            # return False
            pass
        elif len(papernumber_cols) > 1:
            print("Multiple paper number columns - {papernumber_cols}")
            print(f"Columns present = {column_names}")
            return False

        return True


    @classmethod
    def print_classlist_warnings_errors(cls, warn_err: list[dict[str, Any]]) -> None:
        # separate into warn and err
        warn = [X for X in warn_err if X["warn_or_err"] == "warning"]
        err = [X for X in warn_err if X["warn_or_err"] != "warning"]
        # sort by line number
        warn.sort(key=lambda X: X["werr_line"])
        err.sort(key=lambda X: X["werr_line"])

        if warn:
            print("Classlist validation warnings:")
            for X in warn:
                print(f"\tline {X['werr_line']}: {X['werr_text']}")
        if err:
            print("Classlist validation errors:")
            for X in err:
                print(f"\tline {X['werr_line']}: {X['werr_text']}")