PDF文字发票信息提取重命名软件源码

基于 Python + tkinter 编写的桌面软件,可以免费使用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
import tkinter as tk
from tkinter import filedialog, messagebox, ttk, Toplevel, StringVar, Checkbutton, Button
import logging
import pdfplumber
import re
import os
import subprocess
import xlwt
# Folder used as the destination of the XLS export. It is assigned in
# start_processing once at least one PDF has been found and stays None
# until then.
pdf_files_folder = None

# Configure logging: records at INFO level and above are written to app.log,
# which is opened with filemode='w' and therefore rewritten on every start.
logging.basicConfig(filename='app.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

# Current sort direction; column_sorter flips it on every call.
reverse = False
# Sorting helper for tree-view columns.
def sorter(tree, column, data_type, reverse):
    """Reorder the top-level rows of *tree* by the values in *column*.

    Args:
        tree: Widget exposing ``get_children``, ``set`` and ``move``
            (a ttk.Treeview in this application).
        column: Identifier of the column whose cell values are the sort key.
        data_type: ``'num'`` to compare the cells as floats; any other value
            compares the raw cell values.
        reverse: ``True`` for descending order.

    If ``data_type`` is ``'num'`` but any cell cannot be converted to float,
    the whole column falls back to comparing the raw cell values.
    """
    keyed_rows = []
    for item_id in tree.get_children(''):
        keyed_rows.append((tree.set(item_id, column), item_id))

    if data_type == 'num':
        try:
            keyed_rows = [(float(cell), item_id) for cell, item_id in keyed_rows]
        except ValueError:
            # A non-numeric cell was found: keep the unconverted keys.
            pass

    ordered = sorted(keyed_rows, reverse=reverse)
    for position, keyed_row in enumerate(ordered):
        tree.move(keyed_row[1], '', position)

def column_sorter(tree, column, data_type='str'):
    """Flip the module-level sort direction and re-sort *tree* by *column*.

    Args:
        tree: Tree widget forwarded to ``sorter``.
        column: Identifier of the column to sort by.
        data_type: Forwarded to ``sorter``; ``'num'`` requests numeric order.
    """
    global reverse
    flipped = not reverse
    reverse = flipped
    sorter(tree, column, data_type, flipped)
def read_pdf_content(file_path):
    """Extract the text and the key invoice fields from one PDF file.

    Args:
        file_path: Path of the PDF, opened with ``pdfplumber.open``.

    Returns:
        A dict with the keys
        ``"text"`` (text of all pages concatenated),
        ``"invoice_number"`` (first token after "发票号码", or ``""``),
        ``"name"`` (first token after "名称" / "名 称", or ``""``),
        ``"date"`` (first "YYYY年M月D日"-style date, or ``""``) and
        ``"category"`` (list of ``(first, second)`` tuples, one per
        ``*first*second`` match; empty list when nothing matches).
    """
    logging.info('开始函数 read_pdf_content')
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (behaviour of older pdfplumber releases -- NOTE(review): confirm for
        # the installed version); "" keeps the join from raising TypeError.
        full_text = "".join(page.extract_text() or "" for page in pdf.pages)

    # The label may be followed by an ASCII or a full-width colon (\uff1a).
    # Both are skipped so the colon does not end up in the captured value.
    invoice_number = re.findall(r'(?:发票号码)\s*[:\uff1a]*\s*([^\s]+)', full_text)
    name = re.findall(r'(?:名称|名\s{1}称|名\s{1}称\s{1}:)\s*[:\uff1a]*\s*([^\s]+)', full_text)
    # Digits of the date may be separated by whitespace in the extracted text.
    date = re.findall(r'\d\s*\d\s*\d\s*\d\s*年\s*\d\s*\d?\s*月\s*\d\s*\d?\s*日', full_text)
    category = re.findall(r'\*([\u4e00-\u9fa5a-zA-Z]+)\*([\u4e00-\u9fa5a-zA-Z]+)', full_text)
    logging.info('结束函数 read_pdf_content')

    return {
        "text": full_text,
        "invoice_number": invoice_number[0] if invoice_number else "",
        "name": name[0] if name else "",
        "date": date[0] if date else "",
        "category": category if category else [],
    }

def get_pdf_files(pdf_dir):
    """Recursively collect the paths of all PDF files below *pdf_dir*.

    Args:
        pdf_dir: Directory to walk, including all of its sub-directories.

    Returns:
        List of normalized file paths in the order ``os.walk`` yields them.
        The ``.pdf`` extension is matched case-insensitively, so files such
        as ``SCAN.PDF`` are included as well.
    """
    logging.info('Starting function get_pdf_files')
    pdf_files = []
    for root, _dirs, files in os.walk(pdf_dir):
        for file_name in files:
            if not file_name.lower().endswith(".pdf"):
                continue
            # Normalize so that separators are consistent for later renames.
            pdf_files.append(os.path.normpath(os.path.join(root, file_name)))
    logging.info(f'Finished function get_pdf_files. Found {len(pdf_files)} pdf files.')
    return pdf_files

def rename_pdf_file(file_path, new_value):
    """Rename a PDF inside its own directory and return the new path.

    Args:
        file_path: Current path of the PDF.
        new_value: Desired base name without the ``.pdf`` extension.

    Returns:
        The path the file was renamed to. It differs from
        ``<dir>/<new_value>.pdf`` in two cases: characters that are not
        allowed in file names are replaced by ``_``, and a numeric suffix
        (``_1``, ``_2``, ...) is appended when another file already uses the
        name, so an existing file is never overwritten.

    Raises:
        OSError: If ``os.rename`` fails (e.g. the source does not exist).
    """
    logging.info('Starting function rename_pdf_file')
    dir_path = os.path.dirname(file_path)

    # Path separators, Windows-reserved characters and control characters
    # cannot be part of a file name; extracted invoice fields may contain them.
    safe_value = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(new_value))

    new_file_path = os.path.join(dir_path, f"{safe_value}.pdf")
    suffix = 1
    # Renaming a file to its own current name is not a collision.
    while new_file_path != file_path and os.path.exists(new_file_path):
        new_file_path = os.path.join(dir_path, f"{safe_value}_{suffix}.pdf")
        suffix += 1

    os.rename(file_path, new_file_path)
    logging.info(f'Finished function rename_pdf_file. File {file_path} renamed to {new_file_path}')
    return new_file_path

# GUI helpers and the application entry point follow.
# NOTE: only some of the functions below emit logging statements.

def sort_amount(tree, prev_sort=None, column="Amount"):
    """Sort the rows of *tree* by a numeric column and renumber the first cell.

    Args:
        tree: Tree widget exposing ``get_children``, ``set``, ``move`` and
            ``item`` (a ttk.Treeview in this application).
        prev_sort: State dict returned by the previous call, with the keys
            ``"column"`` and ``"reverse"``. ``None`` starts a fresh state.
        column: Identifier of the column holding the amounts. Defaults to
            ``"Amount"``; pass e.g. ``"金额"`` for a tree whose columns use
            other identifiers.

    Returns:
        The (mutated) state dict to pass to the next call. The direction
        toggles on every call for the same column; when the previous state
        refers to another column the sort is ascending.

    Raises:
        ValueError: If a cell in *column* cannot be converted to float.
    """
    if prev_sort is None:
        prev_sort = {"column": column, "reverse": False}

    same_column = prev_sort["column"] == column
    descending = (not prev_sort["reverse"]) if same_column else False

    def amount_of(item):
        return float(tree.set(item, column))

    # Sort, then rewrite the first cell so the row numbers follow the new order.
    sorted_items = sorted(tree.get_children(), key=amount_of, reverse=descending)
    for index, item in enumerate(sorted_items, start=1):
        tree.move(item, '', index - 1)
        tree.item(item, values=(index, *tree.item(item, "values")[1:]))

    prev_sort["column"] = column
    prev_sort["reverse"] = descending
    return prev_sort

def open_pdf(path):
    """Open *path* with the operating system's default application.

    Args:
        path: Path of the file to open.

    Windows uses ``os.startfile``; macOS runs ``open`` and every other
    platform runs ``xdg-open``.
    """
    import sys  # local import: this file has no file-level import of sys

    if os.name == 'nt':  # Windows
        os.startfile(path)
        return

    # os.name is 'posix' on macOS *and* Linux, so it cannot tell the two
    # apart; sys.platform is 'darwin' only on macOS.
    opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
    subprocess.call([opener, path])

def display_results(values, total_amount, input_root):
    """Show the extracted invoice rows in a results window and run its event loop.

    Args:
        values: Iterable of row tuples in the order (amount, invoice number,
            company, date, category, file path). A 1-based index is prepended
            to each row when it is inserted into the table.
        total_amount: Number shown below the table with two decimals and
            copied to the clipboard by the "复制总金额" button.
        input_root: The folder-selection window; it is destroyed before the
            results window is created.

    The call blocks in ``root.mainloop()`` until the results window is closed.
    """
    # The folder-selection window is replaced by the results window.
    input_root.destroy()

    root = tk.Tk()
    root.title("发票金额统计")
    center_window(root, width=950, height=300)

    # Main frame
    main_frame = ttk.Frame(root, padding="10 10 10 10")
    main_frame.grid(column=0, row=0, sticky=(tk.W, tk.E, tk.N, tk.S))
    main_frame.columnconfigure(0, weight=1)
    main_frame.rowconfigure(0, weight=1)

    # Column identifiers double as heading texts and as XLS header cells.
    columns = ("序号", "金额", "发票号码", "公司", "开票日期", "类别", "文件路径")
    column_widths = {"序号": 50, "金额": 100, "发票号码": 200, "公司": 180,
                     "开票日期": 100, "类别": 100, "文件路径": 50}
    path_index = columns.index("文件路径")

    # Tree view with one row per invoice
    tree = ttk.Treeview(main_frame, columns=columns, show="headings")
    tree.grid(column=0, row=0, pady=5, padx=5, sticky=(tk.N, tk.S, tk.E, tk.W))
    for col in columns:
        tree.column(col, width=column_widths[col], anchor='center')
        tree.heading(col, text=col)
    for index, value in enumerate(values, start=1):
        tree.insert("", "end", values=(index, *value))

    def open_clicked_pdf(event):
        """Open the PDF of the row under the pointer."""
        # A double-click on the heading or on empty space hits no row, and the
        # selection may hold several rows, so resolve the row from the click
        # position instead of from the selection.
        item = tree.identify_row(event.y)
        if not item:
            return
        open_pdf(tree.item(item, 'values')[path_index])

    tree.bind('<Double-1>', open_clicked_pdf)

    def rename_selected_files():
        """Ask which fields form the new name and rename the selected PDFs."""
        selected_items = tree.selection()
        if not selected_items:
            messagebox.showerror("错误", "请先选择一个或多个PDF文件进行重命名")
            return

        dialog = Toplevel(root)
        dialog.title("选择需要的字段")

        # (checkbox label, source column) in the order the parts are joined.
        field_options = (
            ('金额', "金额"),
            ('类别', "类别"),
            ('发票号码', "发票号码"),
            ('公司名称', "公司"),
        )
        field_vars = []
        for label, source_column in field_options:
            var = StringVar(dialog, value='no')
            Checkbutton(dialog, text=label, variable=var, onvalue='yes',
                        offvalue='no', anchor='w').pack(fill='x')
            field_vars.append((var, columns.index(source_column)))

        def on_ok():
            """Callback of the dialog's confirm button."""
            chosen = [idx for var, idx in field_vars if var.get() == 'yes']
            if not chosen:
                # Nothing ticked: report it once, before touching any file.
                messagebox.showerror("错误", "至少需要选择一个字段来重命名文件。")
                dialog.destroy()
                return

            failed = []
            for item in selected_items:
                item_values = tree.item(item, 'values')
                new_file_name = "_".join(str(item_values[idx]) for idx in chosen)
                current_file_path = item_values[path_index]
                try:
                    new_file_path = rename_pdf_file(current_file_path, new_file_name)
                except OSError:
                    # Keep going with the remaining files and report at the end.
                    logging.exception('Renaming %s failed', current_file_path)
                    failed.append(str(current_file_path))
                    continue
                # Keep the table in sync so later actions use the new path.
                tree.set(item, column="文件路径", value=new_file_path)

            if failed:
                messagebox.showerror("错误", "以下文件重命名失败:\n" + "\n".join(failed))
            else:
                messagebox.showinfo("完成", "选中的文件已重命名。")
            dialog.destroy()

        Button(dialog, text='确定', command=on_ok).pack()

        # Center the dialog on the screen using its requested size.
        dialog.update_idletasks()
        width = dialog.winfo_reqwidth()
        height = dialog.winfo_reqheight()
        x = (root.winfo_screenwidth() // 2) - (width // 2)
        y = (root.winfo_screenheight() // 2) - (height // 2)
        dialog.geometry('+{}+{}'.format(x, y))

        # Modal behaviour: stay on top of root and block until closed.
        dialog.transient(root)
        dialog.grab_set()
        dialog.wait_window()

    def export_to_xls():
        """Write all table rows to '发票数据.xls' inside pdf_files_folder."""
        # pdf_files_folder is the module-level variable (read-only here).
        if pdf_files_folder is None:
            messagebox.showerror("错误", "PDF文件目录未设置。")
            return
        xls_file_path = os.path.join(pdf_files_folder, '发票数据.xls')

        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet('发票数据')
        # Widths in characters per column, in table order; the factor 256 per
        # character is kept from the original width assignments.
        for col_index, chars in enumerate((5, 10, 30, 30, 20, 25, 10)):
            sheet.col(col_index).width = 256 * chars

        style = xlwt.easyxf('align: vert centre, horiz centre')
        # Row 0 holds the headers; data rows start at 1.
        for col_index, header in enumerate(columns):
            sheet.write(0, col_index, header, style)
        for row_index, item in enumerate(tree.get_children(), start=1):
            for col_index, value in enumerate(tree.item(item, 'values')):
                # Every cell is written as text.
                sheet.write(row_index, col_index, str(value), style)

        try:
            workbook.save(xls_file_path)
        except OSError as err:
            # E.g. the target file is open in another program or not writable.
            logging.exception('Saving %s failed', xls_file_path)
            messagebox.showerror("错误", f"无法保存 '{xls_file_path}':{err}")
            return
        messagebox.showinfo("完成", f"数据成功导出至 '{xls_file_path}' 文件。")

    def copy_total_amount_to_clipboard(total_amount):
        """Replace the clipboard content with the total, two decimals."""
        root.clipboard_clear()
        root.clipboard_append(f"{total_amount:.2f}")

    # Button column to the right of the table
    button_frame = ttk.Frame(main_frame)
    button_frame.grid(column=1, row=0, padx=5, sticky=(tk.N, tk.S))
    for row in range(4):
        button_frame.rowconfigure(row, weight=1)

    rename_button = ttk.Button(button_frame, text="重命名选中文件", command=rename_selected_files)
    rename_button.grid(column=0, row=0, pady=5, sticky=(tk.N, tk.S))

    copy_button = ttk.Button(button_frame, text="复制总金额",
                             command=lambda: copy_total_amount_to_clipboard(total_amount))
    copy_button.grid(column=0, row=1, pady=5, sticky=(tk.N, tk.S))

    export_button = ttk.Button(button_frame, text="导出到XLS", command=export_to_xls)
    export_button.grid(column=0, row=2, pady=5, sticky=(tk.N, tk.S))

    exit_button = ttk.Button(button_frame, text="退出", command=root.destroy)
    exit_button.grid(column=0, row=3, pady=5, sticky=(tk.N, tk.S))

    # Running total below the table
    total_amount_label = ttk.Label(main_frame, text=f"所有发票的累计金额: {total_amount:.2f}")
    total_amount_label.grid(column=0, row=1, pady=5, sticky=tk.W)

    # Process pending geometry/redraw work once before entering the loop.
    root.update()
    root.mainloop()

    logging.info('Exited main loop')
# Optional module-level widget handles. Nothing in this part of the file
# assigns them; resize_treeview_columns uses `tree` when it has been set.
tree = None
root = None

# Share of the available width given to each column, by column position.
_COLUMN_WIDTH_FRACTIONS = (0.05, 0.05, 0.15, 0.15, 0.1, 0.15, 0.15)


def resize_treeview_columns(event):
    """Resize the tree-view columns proportionally to ``event.width``.

    Args:
        event: Event object providing ``width``. When the module-level
            ``tree`` is ``None``, ``event.widget`` is used as the tree if it
            offers a ``column`` method.

    The fractions are applied by column position to whatever column
    identifiers the tree reports, so the handler does not depend on specific
    identifier names. Without a usable tree the call does nothing.
    """
    target = tree if tree is not None else getattr(event, "widget", None)
    if target is None or not hasattr(target, "column"):
        return

    new_width = event.width
    for column_id, fraction in zip(target.cget("columns"), _COLUMN_WIDTH_FRACTIONS):
        # 'center' is the same anchor value the results table uses.
        target.column(column_id, width=int(new_width * fraction), anchor='center')
    target.update_idletasks()

def center_window(root, width=420, height=100):
    """Size *root* to ``width`` x ``height`` and center it on the screen.

    Args:
        root: Window exposing ``winfo_screenwidth``, ``winfo_screenheight``
            and ``geometry``.
        width: Window width in pixels.
        height: Window height in pixels.
    """
    # Half of the free space on each axis is the top-left offset.
    left = (root.winfo_screenwidth() - width) // 2
    top = (root.winfo_screenheight() - height) // 2
    root.geometry("{}x{}+{}+{}".format(width, height, left, top))


def browse_folder(entry):
    """Let the user pick a directory and put its path into *entry*.

    Args:
        entry: Entry widget whose content is replaced by the chosen path.
    """
    chosen = filedialog.askdirectory(title="请选择发票PDF文件夹路径")
    if not chosen:
        # Nothing was chosen: leave the entry untouched.
        return
    entry.delete(0, tk.END)
    entry.insert(0, chosen)

# An amount is a number directly preceded by a yen/yuan sign, either the
# half-width U+00A5 or the full-width U+FFE5. Escapes are used so that no
# other character can slip into the character class.
_AMOUNT_PATTERN = re.compile(r'[\u00a5\uffe5](\d+(?:\.\d{1,2})?)')


def _extract_max_amount(pdf_text):
    """Return the largest currency-sign-prefixed amount in *pdf_text*, or None."""
    amounts = [float(match) for match in _AMOUNT_PATTERN.findall(pdf_text)]
    return max(amounts) if amounts else None


def start_processing(entry, input_root):
    """Scan the chosen folder, extract the invoice data and show the results.

    Args:
        entry: Entry widget holding the folder path.
        input_root: The folder-selection window, handed on to
            ``display_results``.

    For every PDF the largest amount found in its text is taken as the
    invoice amount; PDFs without any amount are left out of the table. A PDF
    that cannot be read is logged and skipped instead of aborting the batch.
    Sets the module-level ``pdf_files_folder`` to the directory of the first
    PDF found.
    """
    global pdf_files_folder

    logging.info('Start processing')
    folder = entry.get()
    if not folder:
        messagebox.showerror("错误", "请先选择或输入一个文件夹路径")
        return

    pdf_files = get_pdf_files(folder)
    if pdf_files:
        pdf_files_folder = os.path.dirname(pdf_files[0])

    values = []
    failed_files = []
    for pdf_file in pdf_files:
        try:
            pdf_content = read_pdf_content(pdf_file)
        except Exception:
            # Batch boundary: the PDF parser can raise many different error
            # types for damaged files, so record the traceback and continue.
            logging.exception('Failed to read %s', pdf_file)
            failed_files.append(pdf_file)
            continue

        max_value = _extract_max_amount(pdf_content["text"])
        if max_value is None:
            continue
        values.append((max_value,
                       pdf_content["invoice_number"],
                       pdf_content["name"],
                       pdf_content["date"],
                       pdf_content["category"],
                       pdf_file))  # keep the original path of the PDF

    logging.info(f'Values: {values}')

    # The amount is the first element of every row.
    total_amount = sum(row[0] for row in values)
    logging.info(f'Total amount: {total_amount}')

    if failed_files:
        messagebox.showwarning(
            "警告", f"有 {len(failed_files)} 个PDF文件无法读取,已跳过(详见 app.log)。")

    logging.info('Calling display_results function')
    display_results(values, total_amount, input_root)
    logging.info('Finished processing')

def main():
    """Build the folder-selection window and run the application."""
    logging.info('Application start')

    window = tk.Tk()
    window.title("发票金额统计")
    center_window(window)

    frame = ttk.Frame(window, padding="10 10 10 10")
    frame.grid(column=0, row=0, sticky=(tk.W, tk.E, tk.N, tk.S))

    # Path entry with a browse button next to it and the start button below.
    folder_entry = ttk.Entry(frame, width=40)
    folder_entry.grid(column=0, row=0, padx=5, pady=5, sticky=tk.W)

    def on_browse():
        browse_folder(folder_entry)

    def on_start():
        start_processing(folder_entry, window)

    browse_button = ttk.Button(frame, text="打开", command=on_browse)
    browse_button.grid(column=1, row=0, padx=5, pady=5, sticky=tk.W)

    start_button = ttk.Button(frame, text="开始处理", command=on_start)
    start_button.grid(column=1, row=1, padx=5, pady=5, sticky=tk.W)

    window.columnconfigure(0, weight=1)
    window.rowconfigure(0, weight=1)
    window.mainloop()
    logging.info('Application finished')


if __name__ == "__main__":
    main()