import re
import pdfplumber
import pandas as pd
from collections import namedtuple
# Location of the PDF to inspect. Raw string so the Windows backslashes stay literal.
path = r"C:\Users\x\Documents\Python Scripts\Files\x.pdf"

# Record type with one field per column of an order line.
Line = namedtuple('Line', 'print_date order_no pos item_no issue qty UM price req_date line_amt')

# Matches a line that begins with one or more digits, a single whitespace
# character and then an upper-case letter, e.g. "53 AB839-11 ...".
new_vend_re = re.compile(r'^\d{1,}\s[A-Z].*')

with pdfplumber.open(path) as pdf:
    # Only the third page (index 2) is examined here; raises IndexError if the
    # document has fewer than three pages.
    page = pdf.pages[2]
    # NOTE(review): some pdfplumber versions appear to return None for a page
    # without extractable text -- confirm; the fallback keeps .split() safe.
    text = page.extract_text() or ''

# Print every line of the page that looks like a line item.
for line in text.split('\n'):
    if new_vend_re.match(line):
        print(line)
查找并打印如下内容:
53 AB839-11 0002 31.00 EA 58.5300 1814.43
有很多页面需要提取类似的值。这只是一个例子。剩余的处理代码:
# Record type for one parsed line item; the seven fields correspond to the
# columns of a row such as "53 AB839-11 0002 31.00 EA 58.5300 1814.43".
Inv = namedtuple('Inv', 'pos item_no issue qty UM price amt')

# One capture group per column, so that groups 1..7 exist on a match.
# Deliberately not anchored with ^: the row is located with .search() and may
# be preceded by other characters on the line (for example an opening "(").
line_item_re = re.compile(
    r'\b(\d+)\s+([A-Z]\S*)\s+(\S+)\s+([\d.,]+)\s+(\S+)\s+([\d.,]+)\s+([\d.,]+)'
)

line_items = []
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        # NOTE(review): some pdfplumber versions appear to return None for a
        # page without extractable text -- confirm; the fallback keeps the
        # loop below safe either way.
        text = page.extract_text() or ''
        for raw_line in text.split('\n'):
            found = line_item_re.search(raw_line)
            if found:
                # groups() yields the seven columns in field order.
                line_items.append(Inv(*found.groups()))

# Column names are taken from the namedtuple fields.
df = pd.DataFrame(line_items)
# Printed explicitly: a bare df.head() shows nothing when run as a script.
print(df.head())
我写出了上面这段代码,但它似乎无法把提取到的数据放进元组的各个字段。我的程序本应遍历这个多页 PDF,用正则表达式从每一行中提取各个字段的值,并把它们存入元组,但由于某种原因代码无法正常工作。
你的正则表达式有误:它以 "^\d+" 开头,意思是行首之后紧跟数字。而文件中的行是以 "(......)" 开头的。请更改正则表达式:
import re
from collections import namedtuple

import pandas as pd

# Record type for one parsed line item; every field keeps the raw string
# taken from the text.
Inv = namedtuple('Inv', 'pos, item_no, issue, qty, UM, price, amt')

# Not anchored with ^ on purpose: the row is searched for anywhere in the
# line because other text (such as "more (") may come before it.
new_vend_re = re.compile(r'\d+\s[A-Z].*')

text = "some\nmore (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things \ntext\n"

line_items = []
for line in text.split('\n'):
    searched = new_vend_re.search(line)
    if not searched:
        continue
    print(line)
    # Strip the parentheses from every token instead of from the match as a
    # whole: the match runs to the end of the line, so the closing ")" sits in
    # the middle of it (glued to the amount), where one str.strip("()") on the
    # full match cannot reach it. Tokens that were nothing but parentheses are
    # dropped.
    stripped = (token.strip("()") for token in searched.group(0).split())
    tokens = [token for token in stripped if token]
    if len(tokens) < len(Inv._fields):
        # Too few columns to fill the record; skip rather than fail on unpacking.
        continue
    # The first seven tokens are the columns; anything after them is surplus.
    pos, item_no, issue, qty, UM, price, amt, *extra = tokens
    line_items.append(Inv(pos, item_no, issue, qty, UM, price, amt))
    if extra:
        print(extra, "that was also captured but not used")

print(*line_items)

# Column names are taken from the namedtuple fields.
df = pd.DataFrame(line_items)
print(df.head())
输出:
# line
more (53 AB839-11 0002 31.00 EA 58.5300 1814.43) things
# crap catchall
['things'] that was also captured but not used
# named tuple
Inv(pos='53', item_no='AB839-11', issue='0002', qty='31.00', UM='EA', price='58.5300', amt='1814.43)')
# df
pos item_no issue qty UM price amt
0 53 AB839-11 0002 31.00 EA 58.5300 1814.43)