发布时间:2019-08-22 08:04:22 | 编辑:auto | 阅读(2097)
一.安装pdfminer3k模块(在命令行中执行:pip install pdfminer3k)
二.读取pdf文件
import sys
import importlib

# NOTE(review): reloading sys looks like a leftover from the Python 2
# "setdefaultencoding" idiom; kept so import-time behaviour is unchanged.
importlib.reload(sys)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import (
    PDFPageInterpreter,
    PDFResourceManager,
    PDFTextExtractionNotAllowed,
)
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams


def readPDF(path, toPath):
    """Append the text of every horizontal text box in a PDF to a text file.

    Pages are processed in document order; for each ``LTTextBoxHorizontal``
    found in a page layout, its text followed by a newline is written to
    ``toPath``.

    Args:
        path: Path of the PDF file to read (opened in binary mode).
        toPath: Path of the text file to write. It is opened in append
            mode, so existing content is kept. The file is opened once per
            call, so it is created even when no text box is found.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # Open the PDF in binary mode.
    with open(path, "rb") as pdf_stream:
        # Create the parser and the document object, then link them together.
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)

        # Initialize the document; no password argument is supplied.
        pdfFile.initialize()

        # Stop early if the document does not allow text extraction.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-aggregating device and page interpreter.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once instead of once per text box.
        with open(toPath, "a") as out:
            # Process the document one page at a time.
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        out.write(x.get_text() + "\n")


path = r"G:\program\PycharmProjects\day06\3.读取pdf文件\文档.pdf"
toPath = r"G:\program\PycharmProjects\day06\3.读取pdf文件\a.txt"
readPDF(path, toPath)
上一篇: mac下安装配置python3.7
下一篇: python实现时间的比较
48870
47939
38718
35863
30288
27042
26075
20909
20714
19072
511°
609°
613°
616°
593°
577°
643°
717°
834°
936°