-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf2txt3.py
More file actions
74 lines (68 loc) · 2.15 KB
/
pdf2txt3.py
File metadata and controls
74 lines (68 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# PDF-to-text extraction entry point
def extTxt(fname, outfile):
    """Extract the text of a PDF file and write it to a file or to stdout.

    Every page of ``fname`` is run through a pdfminer ``TextConverter``
    configured with ``line_margin=30`` and ``word_margin=0.1``; the
    converter writes to the output stream using the ``utf-8`` codec.

    Args:
        fname: Path of the PDF file to read (opened in binary mode).
        outfile: Path of the text file to write. If falsy (``None`` or
            ``''``), the text is written to ``sys.stdout`` instead.

    Returns:
        None.

    Raises:
        IOError/OSError: If ``fname`` or ``outfile`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated
        (``check_extractable=True`` is passed to ``PDFPage.get_pages`` --
        presumably this rejects PDFs that forbid extraction; confirm
        against the installed pdfminer version).
    """
    # Debug level pushed onto the pdfminer classes below (0 = as original).
    debug = 0

    # Input options.
    password = ''
    pagenos = set()   # empty set is passed through -- presumably "all pages"
    maxpages = 0      # 0 is passed through -- presumably "no page limit"
    caching = True

    # Output options.
    imagewriter = None
    rotation = 0      # extra rotation (degrees) added to every page
    codec = 'utf-8'

    # Layout-analysis parameters.
    laparams = LAParams()
    laparams.line_margin = float(30)
    laparams.word_margin = float(0.1)

    # NOTE: these are class-level attributes, so they affect every user of
    # pdfminer in the process, not just this call.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first so that a missing/unreadable PDF does not
    # truncate an existing output file. ``open`` replaces the ``file``
    # builtin, which no longer exists on Python 3.
    with open(fname, 'rb') as fp:
        outfp = open(outfile, 'w') if outfile else sys.stdout
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            finally:
                # Close the device before the stream it writes to.
                device.close()
        finally:
            if outfile:
                outfp.close()
            else:
                # Never close the process-wide stdout; just flush it.
                outfp.flush()
    return
# if __name__ == '__main__': sys.exit(main(sys.argv))