# Source code for plom.scan.scansToImages

# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2018-2020 Andrew Rechnitzer
# Copyright (C) 2018 Elvis Cai
# Copyright (C) 2019-2024 Colin B. Macdonald
# Copyright (C) 2020 Victoria Schuster
# Copyright (C) 2020 Andreas Buttenschoen

import logging
from pathlib import Path
import shutil
import struct
import subprocess
from multiprocessing import Pool
import random
from warnings import warn
import uuid

from tqdm import tqdm
import exif
import fitz
import PIL
import PIL.ExifTags
import PIL.PngImagePlugin

from plom import __version__
from plom import PlomImageExts
from plom import ScenePixelHeight
from plom.scan.bundle_utils import make_bundle_dir
from plom.scan.rotate import pil_load_with_jpeg_exif_rot_applied


log = logging.getLogger("scan")


def _generate_metadata(bundle_name, bundle_page):
    """Build a fresh metadata dict identifying one page image.

    Args:
        bundle_name: identifies the source bundle; stored as ``str(...)``.
        bundle_page: position within the bundle; stored as ``str(...)``.

    Returns:
        dict: keys ``PlomVersion``, ``SourceBundle``,
        ``SourceBundlePosition`` and ``RandomUUID`` (in that order).
        A new random UUID is drawn on every call, so no two calls
        return equal dicts.
    """
    fields = {}
    fields["PlomVersion"] = __version__
    fields["SourceBundle"] = str(bundle_name)
    fields["SourceBundlePosition"] = str(bundle_page)
    # uuid4 is random: this is what makes otherwise-identical images differ
    fields["RandomUUID"] = str(uuid.uuid4())
    return fields


def generate_metadata_str(bundle_name, bundle_page):
    """Render freshly generated metadata as one line of text.

    Each field is formatted as ``key:value;`` and the fields are joined
    with single spaces.

    Args:
        bundle_name: identifies the source bundle.
        bundle_page: position within the bundle.

    Returns:
        str: the flattened metadata.
    """
    fields = _generate_metadata(bundle_name, bundle_page)
    pieces = [f"{key}:{value};" for key, value in fields.items()]
    return " ".join(pieces)


def generate_png_metadata(bundle_name, bundle_page):
    """Package freshly generated metadata as PNG text chunks.

    Args:
        bundle_name: identifies the source bundle.
        bundle_page: position within the bundle.

    Returns:
        PIL.PngImagePlugin.PngInfo: one text entry per metadata field,
        suitable for passing as ``pnginfo=`` when saving a PNG.
    """
    info = PIL.PngImagePlugin.PngInfo()
    fields = _generate_metadata(bundle_name, bundle_page)
    for key in fields:
        info.add_text(key, fields[key])
    return info


def add_metadata_png(filename, bundle_name, bundle_page):
    """Insert metadata into an existing png file.

    Args:
        filename (pathlib.Path/str): name of a png file to edit.
        bundle_name (str): usually the filename of the bundle.
        bundle_page (int): what page of the bundle.

    Returns:
        None

    This is used to write some unique metadata into the PNG file,
    originally to avoid Issue #1573.

    The image is opened with Pillow and saved back over the same
    filename with the new text chunks attached.
    """
    metadata = generate_png_metadata(bundle_name, bundle_page)
    # Context manager so the underlying file handle is released
    # deterministically rather than whenever the image is garbage collected.
    with PIL.Image.open(filename) as img:
        img.save(filename, pnginfo=metadata)


def add_metadata_jpeg_exif(filename, bundle_name, bundle_page):
    """Insert metadata into an existing jpeg file, via EXIF fields.

    Args:
        filename (pathlib.Path/str): name of a jpeg file to edit.
        bundle_name (str): usually the filename of the bundle.
        bundle_page (int): what page of the bundle.

    Returns:
        None

    Raises:
        ValueError: known to fail if existing file has a shorter
            ``user_comment`` field.
    """
    im_shell = exif.Image(filename)
    im_shell.set("user_comment", generate_metadata_str(bundle_name, bundle_page))
    # Serialize *before* opening for writing: opening with "wb" truncates the
    # file, so a failure while building the bytes must not destroy the image.
    data = im_shell.get_file()
    with open(filename, "wb") as fh:
        fh.write(data)


def add_metadata_jpeg_comment(filename, bundle_name, bundle_page):
    """Insert metadata into an existing jpeg file, by appending comment.

    Args:
        filename (pathlib.Path/str): name of a jpeg file to edit.
        bundle_name (str): usually the filename of the bundle.
        bundle_page (int): what page of the bundle.

    Returns:
        None

    Raises:
        struct.error: if the encoded metadata is too long for the
            16-bit length field.

    This is used to write some unique metadata into the JPEG file,
    originally to avoid Issue #1573.

    We just append some data onto the end of the file.  As long as it
    starts with the particular byte sequence ``ff fe``, then it's a
    comment.  Hat-tip:
    https://stackoverflow.com/questions/8283798/adding-a-comment-to-a-jpeg-file-using-python

    You might prefer writing comments to EXIF.  However, this idea is fast and
    safe (?).  Note: we don't put the comment *before* the EOF marker which is
    non-standard: e.g., ``rdjpgcom`` command-line tool cannot read.
    """
    # comment text plus a trailing null
    payload = generate_metadata_str(bundle_name, bundle_page).encode() + b"\x00"
    # COM marker, then a 2-byte big-endian unsigned length.  In JPEG the
    # segment length counts its own two bytes as well as the payload.
    segment = b"\xff\xfe" + struct.pack(">H", len(payload) + 2) + payload
    with open(filename, "ab") as f:
        f.write(segment)


def processFileToBitmaps(
    file_name, dest, *, do_not_extract=False, debug_jpeg=False, add_metadata=True
):
    """Extract/convert each page of pdf into bitmap.

    We have various ways to do this, in rough order of preference:

    1. Extract a scanned bitmap "as-is"
    2. Render the page with PyMuPDF
    3. Render the page with Ghostscript

    The bitmaps will have some metadata written into them to prevent
    otherwise identical pages from producing images with identical
    hashes.  See Issue #1573.

    Args:
        file_name (str, Path): PDF file from which to extract bitmaps.
        dest (str, Path): where to save the resulting bitmap files.
            Must exist.

    Keyword Args:
        do_not_extract (bool): always render, do not extract even if
            it seems possible to do so.  This is off-by-default until
            we are confident extracting won't miss anything.
            See more detailed description in the user-facing command-line
            tool `plom-scan`.
        debug_jpeg (bool): make jpegs, randomly rotated of various
            quality settings, for debugging or demos.  Only applied to
            pages that were rendered (not extracted), and then only to
            a random half of them.  Default: False.
        add_metadata (bool): add invisible metadata to each image
            including bundle name and random numbers.  Default: True.
            If you disable this, you can get two identical images
            (from different pages) giving identical hashes, which
            in theory is harmless but at least in 2022 was causing
            database/client issues.

    Returns:
        list: a list of the images of each page, ordered as in the input
        file.  Each entry is a `pathlib.Path`.

    Raises:
        RuntimeError: not a PDF and not something PyMuPDF can open.
        TypeError: not a PDF, but it can be opened by PyMuPDF.
        ValueError: unrealistically tall skinny or very wide pages.

    For extracting the scanned data as is, we must be careful not to
    just grab any image off the page (for example, it must be the only
    image on the page, and it must not have any annotations on top of
    it).  There are various other conditions; if any of them fail, we
    fall back on rendering with PyMuPDF.

    If the above fail, we fall back on calling Ghostscript as a
    subprocess (the `gs` binary).  TODO: NOT IMPLEMENTED YET.
    """
    dest = Path(dest)

    # issue #126 - replace spaces in names with underscores for output names.
    safeScan = Path(file_name).stem.replace(" ", "_")

    with fitz.open(file_name) as doc:
        if not doc.is_pdf:
            raise TypeError("This does not appear to be a PDF file")
        if doc.is_repaired:
            warn("PyMuPDF had to repair this PDF: perhaps it is damaged in some way?")

        files = []
        for p in doc:
            # output names are numbered from 1, zero-padded to five digits
            basename = f"{safeScan}-{(p.number + 1):05}"
            outname, msgs = try_to_extract_image(
                p,
                doc,
                dest,
                basename,
                file_name,
                do_not_extract=do_not_extract,
                add_metadata=add_metadata,
            )
            if outname is None:
                # could not (or chose not to) extract: render instead
                log.info(
                    f"{basename}: Fitz render. No extract b/c: " + "; ".join(msgs)
                )
                outname = render_page_to_bitmap(
                    p,
                    dest,
                    basename,
                    file_name,
                    add_metadata=add_metadata,
                )
                # For testing, randomly make jpegs, rotated a bit, of various
                # qualities
                if debug_jpeg and random.uniform(0, 1) <= 0.5:
                    mucked = make_mucked_up_jpeg(
                        outname, dest / ("muck-" + basename + ".jpg")
                    )
                    outname.unlink()
                    outname = mucked
            files.append(outname)
        assert len(files) == len(doc), "Expected one image per page"
    return files


def try_to_extract_image(
    p, doc, dest, basename, bundle_name, *, do_not_extract=False, add_metadata=True
):
    """If possible/desirable, extract an image from a PDF page and save to disc.

    "Desirable" means there are no additional markings on the page; no
    information will be lost by looking only at the extracted image
    instead of the original page.

    Args:
        p (fitz.Page):
        doc (fitz.Document):
        dest (pathlib.Path): where to save the resulting bitmap file.
        basename (str): output filename without extension; the extracted
            image's own extension is appended.
        bundle_name (str/pathlib.Path): only used for metadata hackery
            uniqifying pages, you can pass whatever you want.

    Keyword Args:
        do_not_extract (bool): always render, do not extract even if
            it seems possible to do so.  This is off-by-default until
            we are confident extracting won't miss anything.
            See more detailed description in the user-facing command-line
            tool `plom-scan`.
        add_metadata (bool): add invisible metadata to each image
            including bundle name and random numbers.  Default: True.
            If you disable this, you can get two identical images
            (from different pages) giving identical hashes, which
            in theory is harmless but at least in 2022 was causing
            database/client issues.

    Returns:
        2-tuple: first entry is ``pathlib.Path`` or ``None``, where
        ``None`` means we could not (or chose not) to extract.
        Whereas a `Path` means we have extracted the image.
        The second return value is ``msgs`` a list of strings, which
        give semi-user-readable info about why we cannot/choose not
        to extract.

    Raises:
        ValueError: metadata was requested for an allowed extension that
            is neither png nor jpeg (should not happen until
            ``PlomImageExts`` is extended).
    """
    msgs = []
    # Any of these might indicate something more complicated than a scan
    # and hence we should be safe and just render the page.  We only try to
    # extract the bitmap under very conservative circumstances.  It is not
    # safe to assume that if there is a single image on the current page
    # then that is the scan - e.g., student annotates pdf using xournalpp and
    # then stamps a smiley-face `.png` there."
    if p.get_links():
        msgs.append("Has links")
    # annots() and widgets() are iterated only far enough to see whether
    # at least one item exists
    if any(True for _ in p.annots()):
        msgs.append("Has annotations")
    if any(True for _ in p.widgets()):
        msgs.append("Has fillable forms")
    # TODO: which is more expensive, this or getImageList?
    if p.get_text("text"):
        msgs.append("Has text")

    # TODO: Do later to get more info in prep for future change to default
    if do_not_extract:
        msgs.append("Disabled by flag")

    if msgs:
        return None, msgs

    r, d = extractImageFromFitzPage(p, doc)
    if not r:
        # on failure the second value is a message string, not a dict
        msgs.append(d)
        return None, msgs
    log.info(
        '%s: Extracted "%s" from single-image page %sx%s',
        basename,
        d["ext"],
        d["width"],
        d["height"],
    )
    ext = d["ext"].lower()
    if ext not in PlomImageExts:
        # Issue #2346: could try to convert to png, but for now just let fitz render
        log.info(f"  {d['ext']} not in allowlist: leave for fitz render")
        msgs.append(f'extracted image is not {", ".join(PlomImageExts)}')
        return None, msgs
    # the filename keeps the extension exactly as reported (not lowercased)
    outname = dest / (basename + "." + d["ext"])
    with open(outname, "wb") as f:
        f.write(d["image"])
    if add_metadata:
        # watermark for Issue #1573
        if ext == "png":
            add_metadata_png(outname, bundle_name, p.number)
        elif ext in ("jpeg", "jpg"):
            # We write some unique metadata into the JPEG file.  We could
            # use the EXIF data or a JPEG comment.  The latter seems safer
            # as we just append some bytes to the file...?  I'm concerned
            # about interactions with existing EXIF: for example `exif`
            # library cannot write longer "user_comment" field (see tests).
            add_metadata_jpeg_comment(outname, bundle_name, p.number)
        else:
            # there should be no other choice until PlomImageExts is updated
            raise ValueError(f"No support for watermarking \"{d['ext']}\" files")
    return outname, msgs


def render_page_to_bitmap(
    p, dest, basename, bundle_name, debug_jpeg=False, add_metadata=True
):
    """Use PyMuPDF to render a PDF page to an image.

    Both a PNG and a JPEG are written; the JPEG is kept only if it is
    smaller than 90% of the PNG's size, otherwise the PNG is kept.  The
    other file is deleted.

    Args:
        p (fitz.Page):
        dest (pathlib.Path): where to save the resulting bitmap file.
        basename (str): output filename without extension.
        bundle_name (str/pathlib.Path): only used for metadata hackery
            uniqifying pages, you can pass whatever you want.
        debug_jpeg (bool): accepted but not used by this function.
        add_metadata (bool): add invisible metadata to each image
            including bundle name and random numbers.  Default: True.
            If you disable this, you can get two identical images
            (from different pages) giving identical hashes, which
            in theory is harmless but at least in 2022 was causing
            database/client issues.

    Returns:
        pathlib.Path: the rendered image on disc.

    Raises:
        ValueError: overly weird shapes such as too tall ("Safeway receipt")
            or too wide ("fortune cookie").
    """
    aspect = p.mediabox_size[0] / p.mediabox_size[1]
    H = ScenePixelHeight
    W = H * aspect
    MINWIDTH = 1024
    MAXHEIGHT = 15999
    MAXWIDTH = 3 * ScenePixelHeight // 2
    assert MINWIDTH < ScenePixelHeight
    # Note logic not same between tall and wide:
    #   * tall: "Safeway receipt", observed from "infinite paper" software
    #   * wide: "fortune cookie", little strip cropped from regular sheet
    # In the tall case, we use extra pixels vertically because there is
    # actually more to resolve.  But I've never seen a wide case that was
    # wider than a landscape sheet of paper.  Also, currently, Client's
    # would display such a thin wide strip at too large a scale.
    if aspect > 1:
        if W > MAXWIDTH:
            # TODO: warn of extreme aspect ratio?  Flag to control this?
            W = MAXWIDTH
            H = W / aspect
            if H < 100:
                raise ValueError("Scanned a strip too wide and thin?")
    else:
        if W < MINWIDTH:
            W = MINWIDTH
            H = W / aspect
            if H > MAXHEIGHT:
                H = MAXHEIGHT
                W = H * aspect
                if W < 100:
                    raise ValueError("Scanned a long strip of thin paper?")

    # Zoom is chosen from the longer target side.
    # fitz uses ceil (not round) so decrease a little bit
    if W > H:
        z = (float(W) - 0.0001) / p.mediabox_size[0]
    else:
        z = (float(H) - 0.0001) / p.mediabox_size[1]
    # # For testing, choose widely varying random sizes
    # z = random.uniform(1, 5)
    log.info(f"{basename}: Fitz render z={z:4.2f}.")
    pix = p.get_pixmap(matrix=fitz.Matrix(z, z), annots=True)
    # TODO: sometimes width and height get mixed up: Issues #1148, #1935
    # but one of them should match the target, without worrying which is which
    if not (pix.width in (W, H) or pix.height in (W, H)):
        _m = (
            f"Debug: {p}: some kind of rounding error in scaling image?"
            f" Rendered to {pix.width}x{pix.height} from target {W}x{H}"
        )
        warn(_m)
        log.warning(_m)

    pngname = dest / (basename + ".png")
    jpgname = dest / (basename + ".jpg")
    if add_metadata:
        # We write some unique metadata into the PNG file to avoid Issue #1573
        metadata = generate_png_metadata(bundle_name, p.number)
        pix.pil_save(pngname, optimize=True, pnginfo=metadata)
    else:
        # pil_save 10% smaller but 2x-3x slower, Issue #1866
        pix.save(pngname)

    exy = PIL.Image.Exif()  # empty exif data
    if add_metadata:
        # We write some unique metadata into the JPEG exif data to avoid Issue #1573
        assert PIL.ExifTags.TAGS[37510] == "UserComment"
        exy[37510] = generate_metadata_str(bundle_name, p.number)
    # TODO: add progressive=True?
    # Note subsampling off to avoid mucking with red hairlines
    pix.pil_save(jpgname, quality=90, optimize=True, subsampling=0, exif=exy)

    # Keep the jpeg if its at least a little smaller
    # (WebP here is also an option, Issue #1864.)
    if jpgname.stat().st_size < 0.9 * pngname.stat().st_size:
        pngname.unlink()
        return jpgname
    jpgname.unlink()
    return pngname


def make_mucked_up_jpeg(f: Path, outname: Path) -> Path:
    """Given an input file, do horrid things to it in the name of debugging.

    The image is rotated by a randomly chosen slightly-off angle, saved as
    a JPEG at a randomly chosen quality, and sometimes given an EXIF
    orientation tag.

    Args:
        f: input
        outname: output file to be created.

    Returns:
        The output file again.
    """
    img = pil_load_with_jpeg_exif_rot_applied(f)

    angle = random.choice([90.5, 180.4, -90.3, -88, -1])
    msgs = [f"hard-rotate {angle}"]
    # Pillow >= 9.1 has the Resampling enum; older versions keep the constant
    # on PIL.Image itself.  Remove the fallback once minimum Pillow is 9.1.x
    bilinear = getattr(PIL.Image, "Resampling", PIL.Image).BILINEAR
    img = img.rotate(
        angle,
        resample=bilinear,
        expand=True,
        fillcolor=(128, 128, 128, 0),
    )
    quality = random.choice([6, 30, 94, 94, 94])
    msgs.append(f"quality {quality}")
    # half the time no EXIF orientation is written at all
    r = random.choice([None, None, None, 3, 6, 8])
    if r:
        msgs.append(f"exif rotate {r}")
    log.info("  Randomly making jpeg " + ", ".join(msgs))
    img.save(outname, "JPEG", quality=quality, optimize=True)
    im_shell = exif.Image(outname)
    # debugging so maybe we don't need unique JPEG exif metadata for Issue #1573
    if r:
        im_shell.set("orientation", r)
    # Serialize before reopening: "wb" truncates the file we just wrote.
    data = im_shell.get_file()
    # The handle gets its own name instead of rebinding the `f` parameter
    # (that shadowing is what upset MyPy).
    with open(outname, "wb") as fh:
        fh.write(data)
    return outname


def extractImageFromFitzPage(page, doc):
    """Pull the sole image out of a fitz page, if it qualifies.

    The page must reference exactly one image, that image must report a
    nonzero width and height of at least 600x800, and its ``smask`` entry
    must be zero.

    Args:
        page: a page of a fitz document.
        doc: fitz doc containing `page`.

    Returns:
        2-tuple: ``(True, d)`` on success, where ``d`` is the dict returned
        by ``doc.extract_image`` (has at least the fields `width`,
        `height`, `image` and `ext`; ``d["image"]`` is the raw binary
        data).  Otherwise ``(False, msg)`` where ``msg`` is a string
        saying what went wrong.
    """
    images = page.get_images()
    count = len(images)
    if count > 1:
        return False, "More than one image"
    if count == 0:
        return False, "Image List is Empty"

    # first field of the image-list entry is what extract_image wants
    xref = images[0][0]
    info = doc.extract_image(xref)
    w = info.get("width")
    h = info.get("height")
    if not w or not h:
        return False, "Extracted, but no size information"

    # TODO: log.warn?  Rendering unlikely to help
    # unless its a small image centered on a big page
    too_small = w < 600 or h < 800
    if too_small:
        return False, "Extracted, but below minimum size"

    if info["smask"] != 0:
        return False, "Extracted, but had some kind of mask"

    return True, info


def processFileToPng_w_ghostscript(fname, dest):
    """Convert each page of pdf into png using ghostscript.

    Args:
        fname (str, Path): the PDF file to convert.
        dest (str, Path): directory in which to write the PNG files,
            named ``<stem>-%d.png`` with spaces in the stem replaced
            by underscores.

    Returns:
        None.  If ``gs`` exits with a nonzero status, an error message
        is printed and the function returns normally.
    """
    # issue #126 - replace spaces in names with underscores for output names.
    safeScan = Path(fname).stem.replace(" ", "_")
    dest = Path(dest)
    cmd = [
        "gs",
        "-dNumRenderingThreads=4",
        "-dNOPAUSE",
        "-sDEVICE=png256",
        "-o",
        dest / (safeScan + "-%d.png"),
        "-r200",
        fname,
    ]
    try:
        subprocess.run(cmd, stderr=subprocess.STDOUT, shell=False, check=True)
    except subprocess.CalledProcessError as suberror:
        # Output is not captured (it goes straight to our stdout), so
        # `suberror.stdout` is None; report the exception itself, which
        # carries the command and its exit status.
        print("Error running gs: {}".format(suberror))


# TODO: for debugging, can replace with the older ghostscript
# processFileToBitmaps = processFileToPng_w_ghostscript


def gamma_adjust(fn):
    """Apply a simple gamma shift to an image.

    Runs ``mogrify -quiet -gamma 0.5`` on the given file.

    Args:
        fn: the image file to pass to ``mogrify``.

    Raises:
        subprocess.CalledProcessError: ``mogrify`` exited with nonzero
            status.
    """
    command = ["mogrify", "-quiet", "-gamma", "0.5", fn]
    subprocess.run(command, stderr=subprocess.STDOUT, shell=False, check=True)


def postProcessing(thedir, dest, skip_gamma=False):
    """Post-process a directory of scanned bitmaps, then move them.

    Unless disabled, every ``*.png`` in `thedir` is gamma-adjusted in
    parallel using a process pool.  Afterwards every file in `thedir`
    whose extension is listed in ``PlomImageExts`` is moved into `dest`.

    Args:
        thedir (str, Path): a directory full of bitmaps.
        dest (str, Path): directory to move the images into.
        skip_gamma (bool): skip the gamma shift (white balancing).

    Returns:
        None
    """
    src_dir = Path(thedir)
    out_dir = Path(dest)

    if not skip_gamma:
        # TODO: maybe tiff as well?  Not jpeg: not anything lossy!
        print("Gamma shift the PNG images")
        # materialized so the progress bar knows the total
        pngs = list(src_dir.glob("*.png"))
        with Pool() as workers:
            # drain the iterator; we only want the side effects and progress
            for _ in tqdm(workers.imap_unordered(gamma_adjust, pngs), total=len(pngs)):
                pass

    # collect everything first, then move them (e.g. for barcode reading)
    to_move = [path for ext in PlomImageExts for path in src_dir.glob(f"*.{ext}")]
    for path in to_move:
        shutil.move(path, out_dir / path.name)


def process_scans(
    pdf_fname, bundle_dir, skip_gamma=False, skip_img_extract=False, *, demo=False
):
    """Process a pdf file into bitmap images of each page.

    Process each page of a pdf file into bitmaps.

    Do a small amount of post-processing when possible to do losslessly
    (e.g., png).  A simple gamma shift to leave white-white but make
    everything else darker.  Improves images when students write in very
    light pencil.

    Args:
        pdf_fname (str, pathlib.Path): the path to a PDF file.  Used to
            access the file itself.  It is also passed down as the bundle
            name written into image metadata, and its stem is used to
            name the output files.
        bundle_dir (str, pathlib.Path): the filesystem path to the bundle,
            either as an absolute path or relative the CWD.
        skip_gamma (bool): skip white balancing in post processing.
        skip_img_extract (bool): don't try to extract raw images, just
            render each page.  If `False`, images still may not be
            extracted: there are a variety of sanity checks that must
            pass.

    Keyword Args:
        demo (bool): Simulate scanning with random rotations, adding
            noise, lower-quality jpegs, etc.  Default: False

    Returns:
        list: filenames (`pathlib.Path`) in page order, one for each page.
        The same files will be in the ``pageImages`` subdirectory of
        `bundle_dir`.
    """
    # Accept plain strings too: the `/` joins below need a Path.
    bundle_dir = Path(bundle_dir)
    make_bundle_dir(bundle_dir)
    bitmaps_dir = bundle_dir / "scanPNGs"
    page_images_dir = bundle_dir / "pageImages"
    files = processFileToBitmaps(
        pdf_fname,
        bitmaps_dir,
        do_not_extract=skip_img_extract,
        debug_jpeg=demo,
    )
    # TODO: if not skip_gamma, this might clear our image uniqifier (#1573)
    postProcessing(bitmaps_dir, page_images_dir, skip_gamma)
    # HACK: postProcessing moved the files but returns nothing, so rebuild
    # the paths by hand (instead we could rethink postProcessing).
    return [page_images_dir / f.name for f in files]