# Source code for PDF4Cat.converter.any

import os
import io
# docx
from docx import Document
from docx.shared import Inches

# pptx
import collections 
import collections.abc
from pptx import Presentation
from pptx.util import Inches
# docx2html or docx2md
from mammoth import convert as mammoth_convert

from ..cat import PDF4Cat

class any_doc_convert(PDF4Cat):
    """Convert documents between pdf and other formats.

    Subclass of the PDF4Cat parent class; every constructor argument is
    forwarded to the parent unchanged.

    Args:
        doc_file (None, optional): Document file (for multiple operations,
            use 'input_doc_list')
        input_doc_list (list, optional): List of input docs
        passwd (str, optional): Document password (for crypt/decrypt)
        progress_callback (None, optional): Progress callback; the methods
            of this class call it as ``progress_callback(current, total)``

    Raises:
        TypeError: If you use doc_file with input_doc_list (you can use
            only one)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @PDF4Cat.run_in_subprocess  # need add html to pdf
    def convert2pdf(self, output_pdf, use_soffice=False):
        """Any supported document to pdf (using PyMuPDF or Libre Office)

        Only for text based documents, and it does not work for all of them.

        Args:
            output_pdf: Output pdf file. When it is a ``str`` path the
                converter is picked from the extension of the input
                document; any other object is handed straight to the
                generic conversion and its ``save`` call.
            use_soffice (bool, optional): Use Libre Office converter

        Raises:
            NotImplementedError: If the input extension is not handled by
                the selected converter
        """
        if isinstance(output_pdf, str):
            ext = os.path.splitext(self.doc_filename)[1][1:]
            if ext in ['docx', 'doc'] and not use_soffice:
                self.docx2pdf(output_pdf)
                return
            elif ext in ['png', 'jpg'] and not use_soffice:
                # NOTE(review): unlike the sibling branches there is no
                # `return` here, so the generic conversion below also runs
                # and saves to the same output path again - confirm whether
                # that is intended.
                self.img2pdf(output_pdf)
            elif "." + ext in self.libre_exts and use_soffice:
                self.soffice_convert2pdf(output_pdf)
                return
            else:
                raise NotImplementedError(f"File extension '{self.doc_fileext}' => '.pdf' not supported")
            # elif ext == 'pptx' or ext == 'ppt':
            #     self.pptx2pdf(output_pdf)
            #     return

        doc = self.pdf_open(self.doc_file)
        pdf_bytes = doc.convert_to_pdf()  # convert to pdf
        pdf = self.pdf_open("pdf", pdf_bytes)  # open as pdf

        # TODO: carry over the table of contents and the metadata:
        # toc = doc.get_toc()  # table of contents of input
        # pdf.set_toc(toc)  # simply set it for output
        # meta = doc.metadata  # read and set metadata
        # if not meta["producer"]:
        #     meta["producer"] = "PDF4Cat https://github.com/BlackCatDevel0per/PDF4Cat"
        # if not meta["creator"]:
        #     meta["creator"] = "PDF4Cat pdf tool"
        # meta["modDate"] = self.fitz_get_pdf_now()
        # meta["creationDate"] = meta["modDate"]
        # pdf.set_metadata(meta)

        # now process the links
        link_cnti = 0
        link_skip = 0
        for pinput in doc:  # iterate through input pages
            links = pinput.get_links()  # get list of links
            link_cnti += len(links)  # count how many
            pout = pdf[pinput.number]  # read corresponding output page
            for link in links:
                if link["kind"] == self.fitz_LINK_NAMED:
                    # we do not handle named links
                    print("named link page", pinput.number, link)
                    link_skip += 1  # count them
                    continue
                pout.insert_link(link)  # simply output the others

        # save the conversion result
        pdf.save(output_pdf, garbage=4, deflate=True)

        # say how many named links we skipped
        if link_cnti > 0:
            print("Skipped %i named links of a total of %i in input." % (link_skip, link_cnti))

    # Pages are rendered into in-memory BytesIO objects instead of
    # temporary files (it is faster)
    def gen_images4conv(self, pdf) -> "collections.abc.Iterator[io.BytesIO]":
        """Generator, render every page of a pdf to a png image in memory

        Args:
            pdf: pdf object (PDF4Cat.open)

        Yields:
            io.BytesIO: png image of one page, in page order, rewound to
                the start so it can be read directly
        """
        for page_no in range(pdf.page_count):
            page = pdf.load_page(page_no)  # number of page
            pix = page.get_pixmap()
            del page  # drop the page as early as possible
            io_data = io.BytesIO()
            io_data.write(pix.tobytes(output="png"))
            del pix
            # Rewind, otherwise a plain read() by the consumer returns
            # nothing because the position is left at the end of the data.
            io_data.seek(0)
            yield io_data

    @PDF4Cat.run_in_subprocess
    def pdf2pptx(self, output_pptx, A4=True):
        """Pdf to pptx (using PyMuPDF), one page image per slide

        Args:
            output_pptx: Output pptx file; when falsy, defaults to
                ``<doc_name>_out.pptx`` in the document directory
            A4 (bool, optional): Use portrait 8.5 x 11 inch slides (note:
                that is the US Letter size, despite the name), otherwise
                use 13.333 x 7.5 inch slides
        """
        if not output_pptx:
            # was "_out.pdf", which is the wrong extension for a pptx file
            output_pptx = os.path.join(self.doc_path, self.doc_name + "_out.pptx")
        output_pptx = os.path.join(os.getcwd(), output_pptx)

        pdf = self.pdf_open(self.doc_file, passwd=self.passwd)
        prs = Presentation()

        w, h = (8.5, 11) if A4 else (13.333, 7.5)
        # Always resize the slide to the picture size: the default
        # python-pptx template is 10 x 7.5 inch, so a 13.333 inch wide
        # picture would otherwise overflow the slide.
        prs.slide_width = Inches(w)
        prs.slide_height = Inches(h)

        blank_slide_layout = prs.slide_layouts[6]
        for io_data in self.gen_images4conv(pdf):
            slide = prs.slides.add_slide(blank_slide_layout)
            slide.shapes.add_picture(io_data, 0, 0, width=Inches(w), height=Inches(h))
            del io_data

            self.counter += 1  # need enumerate
            self.progress_callback(self.counter, pdf.page_count)

        prs.save(output_pptx)
        self.counter = 0

    @PDF4Cat.run_in_subprocess
    def pdf2docx(self, output_docx):
        """Pdf to docx (using PyMuPDF), one page image per picture

        Args:
            output_docx: Output docx file; when falsy, defaults to
                ``<doc_name>_out.docx`` in the document directory
        """
        if not output_docx:
            # was "_out.pdf", which is the wrong extension for a docx file
            output_docx = os.path.join(self.doc_path, self.doc_name + "_out.docx")
        output_docx = os.path.join(os.getcwd(), output_docx)

        pdf = self.pdf_open(self.doc_file, passwd=self.passwd)
        document = Document()

        for io_data in self.gen_images4conv(pdf):
            # document.add_picture(io_data, width=Inches(8.5), height=Inches(11))
            document.add_picture(io_data, width=Inches(5.7), height=Inches(9))  # 5.75
            del io_data

            self.counter += 1  # need enumerate
            self.progress_callback(self.counter, pdf.page_count)

        document.save(output_docx)
        self.counter = 0

    @PDF4Cat.run_in_subprocess
    def docx2html(self, output_doc, style_map=None):
        """docx to html (using mammoth)

        Args:
            output_doc: Output html file, either a ``str`` path or an
                object with a binary ``write`` method; when falsy, defaults
                to ``<doc_name>_out.html`` in the document directory
            style_map (None, optional): Path to a file holding the mammoth
                style map
        """
        if not output_doc:
            # was "_out.pdf", which is the wrong extension for an html file
            output_doc = os.path.join(self.doc_path, self.doc_name + "_out.html")
        if isinstance(output_doc, str):
            output_doc = os.path.join(os.getcwd(), output_doc)

        if not style_map:
            style_map = None
        else:
            with open(style_map) as style_map_fileobj:
                style_map = style_map_fileobj.read()

        with open(self.doc_file, "rb") as docx_file:
            result = mammoth_convert(
                docx_file,
                style_map=style_map,
                output_format='html',  # markdown have too
            )
        html_bytes = result.value.encode()

        if not isinstance(output_doc, str):  # need os like path..
            output_doc.write(html_bytes)
            return

        with open(output_doc, "wb") as output:
            output.write(html_bytes)

    @PDF4Cat.run_in_subprocess
    def docx2pdf(self, output_pdf):
        """docx to pdf (using PyMuPDF [docx=>html=>pdf])

        Args:
            output_pdf: Output pdf file; when falsy, defaults to
                ``<doc_name>_out.pdf`` in the document directory
        """
        if not output_pdf:
            output_pdf = os.path.join(self.doc_path, self.doc_name + "_out.pdf")
        output_pdf = os.path.join(os.getcwd(), output_pdf)

        # Keep the intermediate html in memory only.
        html_tmp = io.BytesIO()
        self.docx2html(html_tmp, run_in_subprocess=False)

        doc = self.pdf_open(filename="html", stream=html_tmp)
        try:
            pdfbytes = doc.convert_to_pdf()  # need convert to image..?
        finally:
            doc.close()  # release the document even when conversion fails

        with open(output_pdf, 'wb') as output:
            output.write(pdfbytes)
        # pdf = self.pdf_open("pdf", pdfbytes)
        # pdf.save(output_pdf)
        # pdf.close()