Source code for PDF4Cat.converter.ocr

import os
import io

from ..cat import PDF4Cat

[docs]class OCR(PDF4Cat): """Subclass of PDF4Cat parent class Args: doc_file (None, optional): Document file (for multiple operations, 'use input_doc_list') input_doc_list (list, optional): List of input docs passwd (str, optional): Document password (for crypt/decrypt) progress_callback (None, optional): Progress callback like: Raises: TypeError: If you use doc_file with input_doc_list (you can use only one) """ def __init__(self, *args, **kwargs): super(OCR, self).__init__(*args, **kwargs) # (it is faster)
[docs] def gen_pdfImagesOCR(self, pages: list = [], language: str = 'eng', zoom: float = 1.5) -> tuple: """Generator, generate BytesIO object Args: pages (list, optional): List of pages to select like [1, 3, 5, 15] language (str, optional): Language to ocr (look fitz.pdfocr_tobytes) zoom (float, optional): Zoom image (look fitz.Matrix docs) Yields: tuple: BytesIO """ pdf = self.pdf_open(self.doc_file, passwd=self.passwd) mat = self.fitz_Matrix(zoom, zoom) noOfPages = range(pdf.page_count) if pages: noOfPages = pages for pageNo in noOfPages: if pages and pageNo not in pages: continue io_data = io.BytesIO() # page = pdf.load_page(pageNo) #number of page pix = page.get_pixmap(matrix = mat) bytes_ = pix.pdfocr_tobytes(language=language) # yield bytes_
[docs] @PDF4Cat.run_in_subprocess def pdfocr(self, language: str = 'eng', output_pdf = None, pages: list = [], start_from: int = 0, zoom: float = 1.5) -> None: """OCR pdf to file Args: language (str, optional): Language to ocr (look fitz.pdfocr_tobytes) output_pdf (None, optional): Output pdf file pages (list, optional): List of pages to select like [1, 3, 5, 15] start_from (int, optional): Enumerate from n zoom (float, optional): Zoom image (look fitz.Matrix docs) """ if not output_pdf: output_pdf = os.path.join(self.doc_path, self.doc_name+"_out.pdf") output_pdf = os.path.join(os.getcwd(), output_pdf) #doc.insert_pdf(imgpdf) pdf_doc = self.pdf_open(self.doc_file) pdf = self.pdf_open() if not pages: pcount = pdf_doc.page_count else: pcount = len(pages) for bytes_ in self.gen_pdfImagesOCR(pages, language, zoom): with self.pdf_open("pdf", bytes_) as ocr_processed: pdf.insert_pdf(ocr_processed) self.counter += 1 #need enumerate self.progress_callback(self.counter, pcount) pdf.save(output_pdf) self.counter = 0