-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathremover.py
272 lines (209 loc) · 10.7 KB
/
remover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PyPDF2 - a Python library that allows reading, writing, and manipulating PDF files
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import NumberObject, TextStringObject, NameObject
from PyPDF2.generic import ContentStream
# tkinter - used for building the GUI (Graphical User Interface) application
from tkinter import Tk, Label, Button, StringVar
from tkinter.filedialog import askopenfilename, asksaveasfilename, askdirectory
from tkinter.constants import N,S,W,E, LEFT, TOP, RIGHT, BOTTOM
import sys, os
import tkinter.font as font
# reportlab - a library for generating PDFs programmatically
from reportlab.pdfgen import canvas
def resource_path(relative_path):
    """Resolve *relative_path* against the application's base directory.

    The base directory is ``sys._MEIPASS`` when that attribute exists,
    otherwise the absolute path of the current working directory.

    :param str relative_path: path relative to the base directory.
    :return: the joined path.
    """
    try:
        base_dir = sys._MEIPASS
    except AttributeError:
        base_dir = os.path.abspath(".")
    return os.path.join(base_dir, relative_path)
class PdfEnhancedFileWriter(PdfWriter):
    """PdfWriter that can neutralise colors and drop highlight-like rectangles.

    ``removeWordStyle`` rewrites the content stream of every page held by
    this writer: rgb/cmyk color operators are forced to black (before text)
    or white (before rectangles), and ``re`` rectangles whose size looks like
    a text highlight are removed.
    """

    # Replacement operands per color-operator family, then per color name.
    colors_operands = {
        'rgb': {
            'black': [NumberObject(0), NumberObject(0), NumberObject(0)],
            'white': [NumberObject(1), NumberObject(1), NumberObject(1)],
        },
        'cmyk': {
            'black': [NumberObject(0), NumberObject(0), NumberObject(0), NumberObject(1)],
            'white': [NumberObject(0), NumberObject(0), NumberObject(0), NumberObject(0)],
        },
        'grayscale': {
            'black': [NumberObject(0)],
            'white': [NumberObject(1)],
        }
    }

    # Content-stream operator -> coarse category. Kept at class level so the
    # table is built once instead of on every lookup.
    _operator_types = {
        b"Tj": "text",
        b"'": "text",
        b'"': "text",
        b"TJ": "text",
        b"rg": "rgb",  # color
        b"RG": "rgb",  # color
        b"k": "cmyk",  # color
        b"K": "cmyk",  # color
        b"g": "grayscale",  # color
        b"G": "grayscale",  # color
        b"re": "rectangle",
        b"l": "line",  # line
        b"m": "line",  # start line
        b"S": "line",  # stroke(paint) line
    }

    # Position of the string operand for the single-string text operators.
    _text_operand_index = {b"Tj": 0, b"'": 0, b'"': 2}

    def _getOperatorType(self, operator):
        """Return the category name of *operator*, or None when it is not tracked."""
        return self._operator_types.get(operator)

    def _getColorTargetOperationType(self, color_index, operations):
        """Find what kind of operation the color set at *color_index* applies to.

        Scans the operations that follow *color_index* and returns the category
        of the first text, rectangle or line operator found.

        :param int color_index: index of the color operation in *operations*.
        :param list operations: sequence of ``(operands, operator)`` pairs.
        :return: ``'text'``, ``'rectangle'`` or ``'line'``; ``False`` when none follows.
        """
        paint_targets = ('text', 'rectangle', 'line')
        for i in range(color_index + 1, len(operations)):
            operator_type = self._getOperatorType(operations[i][1])
            if operator_type in paint_targets:
                return operator_type
        return False

    def getMinimumRectangleWidth(self, fontSize, minimumNumberOfLetters = 1.5):
        """Return the width of *minimumNumberOfLetters* letters at *fontSize*."""
        return fontSize * minimumNumberOfLetters

    def _blankByteStrings(self, operands, operator):
        """Replace non-TextStringObject operands of a text operator with empty text (in place)."""
        if operator == b"TJ":
            pieces = operands[0]
            for i, piece in enumerate(pieces):
                if not isinstance(piece, TextStringObject):
                    pieces[i] = TextStringObject()
            return
        index = self._text_operand_index.get(operator)
        if index is not None and not isinstance(operands[index], TextStringObject):
            operands[index] = TextStringObject()

    def _isStyleRectangle(self, operands, fontSize):
        """Tell whether the ``re`` operands describe a highlight-sized rectangle.

        A rectangle qualifies when it is wider than one letter at *fontSize*
        and its height is above 1.5 and at most ``fontSize + 6``.
        """
        rectangle_width = operands[-2].as_numeric()
        rectangle_height = operands[-1].as_numeric()
        minWidth = self.getMinimumRectangleWidth(fontSize, 1)  # (length of X letters at the current size)
        maxHeight = fontSize + 6  # range to catch really big highlights
        minHeight = 1.5  # so that thin lines will not be removed
        return rectangle_width > minWidth and minHeight < rectangle_height <= maxHeight

    def removeWordStyle(self, ignoreByteStringObject=False):
        """
        Removes imported styles from Word - Path Constructors rectangles - from this output.

        Pages without a ``/Contents`` entry are left untouched.

        :param bool ignoreByteStringObject: optional parameter
            to ignore ByteString Objects.
        """
        pages = self.get_object(self._pages)['/Kids']
        for page in pages:
            pageRef = self.get_object(page)
            # /Contents is optional in a page dictionary; there is nothing to
            # rewrite on such a page and indexing it would raise KeyError.
            if "/Contents" not in pageRef:
                continue
            content = pageRef["/Contents"].get_object()
            if not isinstance(content, ContentStream):
                content = ContentStream(content, pageRef)

            kept_operations = []
            last_font_size = 0
            for operator_index, (operands, operator) in enumerate(content.operations):
                if operator == b'Tf' and operands[0][:2] == '/F':
                    last_font_size = operands[1].as_numeric()

                if ignoreByteStringObject:
                    self._blankByteStrings(operands, operator)

                operator_type = self._getOperatorType(operator)
                # we are ignoring all grayscale colors
                # tests showed that black underlines, borders and tables are defined
                # by grayscale and aren't using rgb/cmyk colors
                if operator_type == 'rgb' or operator_type == 'cmyk':
                    color_target_operation_type = self._getColorTargetOperationType(
                        operator_index, content.operations)
                    new_color = None
                    # we are coloring all text in black and all rectangles in white
                    # removing all colors paints rectangles in black which gives us unwanted results
                    if color_target_operation_type == 'text':
                        new_color = 'black'
                    elif color_target_operation_type == 'rectangle':
                        new_color = 'white'
                    if new_color:
                        # copy so the stream never holds the shared class-level list
                        operands = list(self.colors_operands[operator_type][new_color])

                # remove styled rectangles (highlights, lines, etc.)
                # the 're' operator is a Path Construction operator that creates a rectangle;
                # presumably that is how Word embeds its graphics when producing a PDF
                if operator == b're' and self._isStyleRectangle(operands, last_font_size):
                    continue
                kept_operations.append((operands, operator))

            content.operations = kept_operations
            pageRef[NameObject('/Contents')] = content
# Application main window. It is created first so the StringVar objects
# below have a Tk root to attach to.
root = Tk()
root.title('Answers Remover 2.0')
#root.iconbitmap( resource_path('./icon.ico'))
# Loaded PdfReader objects, appended by load1().
pdf_list = []
# Source paths of the loaded PDFs, appended by load1(); remove_images()
# pairs them with pdf_list by position to name the output files.
filePaths = []
# Text shown by the status label ("N file(s) loaded").
filename1 = StringVar()
# NOTE(review): not read by any code visible in this file.
src_pdf = StringVar()
def createMultiPage(file_path):
    """Create a PDF at *file_path* with a "This is page N" caption.

    The loop runs ``c.getPageNumber()`` times on a freshly created canvas
    and draws the caption at (50, 50) on the current page each time.

    :param str file_path: destination of the generated PDF.
    """
    c = canvas.Canvas(file_path)
    for _ in range(c.getPageNumber()):
        # interpolate the page number; the placeholder used to be drawn literally
        text = "This is page %s" % c.getPageNumber()
        c.drawString(50, 50, text)
        # c.showPage() is intentionally not called here, so every caption
        # lands on the same page
    c.save()
def load_pdf(filename):
    """Open the PDF at *filename* and return a PdfReader for it.

    The file is read fully into memory so its handle can be closed right
    away. The reader is given the in-memory copy because it keeps using
    its stream after this function returns.

    :param str filename: path of the PDF to load.
    :return: a PdfReader over the file's bytes.
    """
    from io import BytesIO

    with open(filename, 'rb') as f:
        data = BytesIO(f.read())
    return PdfReader(data)
def load1():
    """Let the user pick one or more PDFs and load each of them.

    Every picked file is parsed with load_pdf(); the reader goes to
    ``pdf_list``, its path to ``filePaths``, and the status label is
    updated with the running count.
    """
    chosen = askopenfilename(multiple=True, filetypes=(('PDF File', '*.pdf'), ('All Files', '*.*')))
    # splitlist turns the dialog result into a tuple of paths
    for path in root.tk.splitlist(chosen):
        reader = load_pdf(path)
        # Record the path only after the load succeeded, so filePaths[i]
        # always names the file behind pdf_list[i].
        pdf_list.append(reader)
        filePaths.append(path)
        filename1.set(str(len(pdf_list)) + " file(s) loaded")
        print("Loaded " + path)
def add_to_writer(pdfsrc, writer):
    """Copy every page of *pdfsrc* into *writer*, then strip Word styling.

    :param pdfsrc: object exposing the source pages as ``pdfsrc.pages``.
    :param writer: object providing ``add_page(page)`` and ``removeWordStyle()``.
    """
    # plain loop: the pages are added for their side effect, no list is needed
    for page in pdfsrc.pages:
        writer.add_page(page)
    writer.removeWordStyle()
def remove_images():
    """Write a cleaned ``SCRAPED_<name>`` copy of every loaded PDF.

    Asks for an output folder, then for each loaded reader builds a fresh
    writer, copies the pages through add_to_writer() and saves the result
    next to the chosen folder under the source file's name prefixed with
    ``SCRAPED_``. Quits the main loop when all files are written.
    """
    output_saving_dir = askdirectory(title="Choose output folder...")
    if not output_saving_dir:
        # Dialog was cancelled: keep the window open instead of writing
        # into the current working directory.
        return

    done = 0
    for reader, source_path in zip(pdf_list, filePaths):
        tail = os.path.basename(source_path)
        print(tail)
        file_path = os.path.join(output_saving_dir, "SCRAPED_" + tail)
        # One writer per input file; a shared writer would carry the pages
        # of earlier inputs into every later output.
        writer = PdfEnhancedFileWriter()
        add_to_writer(reader, writer)
        # Open the output only once the pages are ready, and close it even
        # if writing fails.
        with open(file_path, 'wb') as outputfile:
            writer.write(outputfile)
        done += 1
        print(str(done) + " file(s) done")
    print("Job is done")
    root.quit()
# ---- Main window layout and event loop ----
##Label(root, text="Rectangles remover").grid(row=0, column=2, sticky=E)
# File picker. Exactly one button is placed in this cell: a second button
# gridded at the same row/column would only sit hidden underneath it.
Button(root, text="Choose one or more PDFs", command=load1, font='Helvetica 12 bold', fg="green", height=4,
       width=20).grid(row=1, column=0)
# Status label bound to filename1 ("N file(s) loaded").
Label(root, textvariable=filename1, width=20).grid(row=1, column=1, sticky=(N,S,E,W))
#photo= PhotoImage(file=resource_path('./button_pic.png'))
#Button(root, text="Remove answers",image=photo, command=remove_images, width=100, height=120).grid(row=1, column=2,sticky=E)
# Starts the clean-up of every loaded PDF.
Button(root, text="Remove answers", command=remove_images, font='Helvetica 12 bold', fg="red", height=4).grid(row=1, column=2, sticky=E)
#Label(root, text="Remove Answers^^").grid(row=2, column=2, sticky=E)
#Label(root, text="Good Luck!").grid(row=2, column=0, sticky=W)
# User-facing notice (Hebrew), right-justified across all three columns.
Label(root, text='''שימו לב,\n
האפליקציה מסירה אובייקטים מעוצבים שיובאו מוורד,\n
ולכן יש סיכוי שתסיר גם טבלאות ואלמנטים עיצוביים אחרים, אם קיימים.\n
הדף לא נפתח כראוי בתוכנות מסויימות של אדובי,\n
הפתרון הפשוט לכך הוא לחצן ימני על הקובץ שנוצר,\n
לחצן ימני > פתח באמצעות > כרום, פיירפוקס, או כל תוכנה אחרת שיודעת להציג פדף.\n
\n
וזיכרו: הפתרון הטוב ביותר יהיה לשלוח מייל חביב למתרגל האחראי לאחר המבחן\nולבקש ממנו להעלות גם גרסה ללא הפתרונות למען הסמסטרים הבאים.\n
\n
בהצלחה!\n''', font='Helvetica 7', justify=RIGHT).grid(row=3, columnspan=3, sticky=E)
# Give every widget the same outer padding.
for child in root.winfo_children():
    child.grid_configure(padx=10, pady=10)
root.mainloop()