-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_images.py
265 lines (225 loc) · 9.9 KB
/
process_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""
File: process_images.py
Description: This script takes a docx as input, and outputs a docx. It iterates through the images of the
input document, asking an LLM if it is an image of a sequence diagram. If it is, it asks the LLM again to
describe the diagram, appending the image with the AI description to the output document.
Contributors:
Marcelo Santibáñez
David Schön
Adrian Hassa
Created: 2024-11-07
Last Modified: 2024-12-10
Project: 3GPP Requirement Tools
URL: https://github.com/Adrian2901/3gpp-requirements-tools
License: MIT License (see LICENSE file for details)
"""
import os
import requests
import json
import cv2
import easyocr
import base64
from docx import Document
from docx.shared import Inches
from docx2python import docx2python
from PIL import Image
def preprocess_image(image_path, blur=1):
    '''
    Preprocess an image so OCR can detect text more reliably.

    Converts to grayscale, applies Otsu thresholding, removes long vertical
    lines (the actor lifelines of a sequence diagram), applies a median blur
    to reduce noise, and re-inverts so the result is dark text on white.

    :param image_path: The path to the image to be preprocessed
    :param blur: The ksize of the median-blur kernel (must be odd; 1 is a no-op)
    :return: The preprocessed single-channel image
    :raises FileNotFoundError: if the image cannot be read from image_path
    '''
    # Load the image. cv2.imread returns None instead of raising when the
    # file is missing or unreadable; fail fast with a clear error instead
    # of a cryptic crash inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply Otsu thresholding to make the text stand out (text becomes white)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # A 1x50 opening keeps only long vertical strokes (lifelines), which are
    # then subtracted. Horizontal-line removal was tried and deliberately
    # disabled by the original authors.
    kernel_v = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel_v, iterations=1)
    binary_no_lines = cv2.subtract(binary, vertical_lines)
    # Apply slight blur to reduce noise
    blurred_img = cv2.medianBlur(binary_no_lines, blur)
    # Invert the binary image back to dark-text-on-light
    preprocessed_img = cv2.bitwise_not(blurred_img)
    return preprocessed_img
def process_sequence_diagram(image_path, debug=False):
    '''
    Process a sequence diagram image to extract actors and messages via OCR.

    The image is preprocessed (see preprocess_image), OCR'd with easyocr,
    text fragments that overlap vertically are merged into single messages,
    and the detections are split into actors (text near the top of the
    image) and messages (everything else).

    :param image_path: The path to the image to be processed
    :param debug: Boolean value to enable debug mode (saves intermediate images)
    :return: A human-readable string listing the extracted actors and messages
    '''
    # Threshold for actors detection: text whose top edge lies within the
    # top 8% of the image height is treated as an actor label.
    actors_threshold = 0.08
    # Preprocess the image
    img = preprocess_image(image_path)
    # DEBUG: Save the preprocessed image
    if debug:
        cv2.imwrite("preprocessed_image.png", img)
    # Use the easyocr model to extract text.
    # NOTE(review): the Reader (and its model weights) is rebuilt on every
    # call; consider caching it at module level if this becomes a bottleneck.
    reader = easyocr.Reader(['en'])
    results = reader.readtext(img)
    # Lists for storing relevant texts and bounding boxes
    texts, bboxes = [], []
    # Get text and bounding box info from easyocr results
    for (bbox, text, conf) in results:
        # Remove empty strings
        if text.strip():
            # Remove the "|" character (sometimes dashed lines are detected by the ocr)
            if text != "|":
                # Convert easyocr's 4-point polygon into an axis-aligned
                # (x, y, w, h) rectangle
                texts.append(text)
                x_min = min(bbox, key=lambda p: p[0])[0]
                y_min = min(bbox, key=lambda p: p[1])[1]
                x_max = max(bbox, key=lambda p: p[0])[0]
                y_max = max(bbox, key=lambda p: p[1])[1]
                bboxes.append((x_min, y_min, x_max - x_min, y_max - y_min))
    # Lists for storing combined overlapping texts
    combined_texts = []
    combined_bboxes = []
    # Merge fragments that the OCR split across one message line: two boxes
    # whose vertical extents overlap by at least half the smaller height are
    # considered parts of the same message. Relies on easyocr returning
    # detections roughly in reading order.
    for i, (text, bbox) in enumerate(zip(texts, bboxes)):
        # Get the bounding box coordinates
        x, y, w, h = bbox
        # Actor labels (top of image) are never merged; neither is the
        # first detection (nothing to merge with yet).
        if not combined_bboxes or y < img.shape[0] * actors_threshold:
            combined_texts.append(text)
            combined_bboxes.append(bbox)
        else:
            # Get the previous (most recently kept) bbox
            prev_x, prev_y, prev_w, prev_h = combined_bboxes[-1]
            # Vertical overlap between the current and previous boxes
            overlap_height = min(y + h, prev_y + prev_h) - max(y, prev_y)
            # If the overlap is more than half of the smaller height, combine the texts
            if overlap_height >= 0.5 * min(h, prev_h):
                # Combine the texts
                combined_texts[-1] += " " + text
                # Combine the bboxes into their union rectangle
                new_x = min(prev_x, x)
                new_y = min(prev_y, y)
                new_w = max(prev_x + prev_w, x + w) - new_x
                new_h = max(prev_y + prev_h, y + h) - new_y
                combined_bboxes[-1] = (new_x, new_y, new_w, new_h)
            else:
                # If there is no overlap, add the text and bbox to the lists
                combined_texts.append(text)
                combined_bboxes.append(bbox)
    # Update the texts and bboxes lists
    texts = combined_texts
    bboxes = combined_bboxes
    # DEBUG: Save an image with bounding boxes drawn over the detected texts
    if debug:
        for (x, y, w, h) in bboxes:
            cv2.rectangle(img, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)
        cv2.imwrite("debug_image.png", img)
    # Identify actors and messages
    actors = []
    messages = []
    # Iterate over the texts and bboxes
    for text, (x, y, w, h) in zip(texts, bboxes):
        # Check if the text is above the actors threshold
        if y < img.shape[0] * actors_threshold:
            # Store actor name and x-center (to be used in arrow detection)
            actors.append((text, x + w // 2))
        else:
            # Store message and bbox
            messages.append((text, (x, y, w, h)))
    # Build the human-readable output string
    output = ""
    # List actors first, then messages, one per indented line
    output += "Actors:\n"
    for actor, _ in actors:
        output += f"\t{actor}\n"
    output += "Messages:\n"
    for message, bbox in messages:
        output += f"\t{message}\n"
    return output
def is_sequence_diagram(image_path, llm_address, prompts):
    '''
    Ask the multimodal LLM whether the attached image is a sequence diagram.

    :param image_path: The path to the image to be analyzed
    :param llm_address: host:port of the Ollama-compatible LLM server
    :param prompts: Prompts dictionary; uses the 'verify_image_context' entry
    :return: True if the LLM answers "yes" (or if the request fails, so no
             image is silently dropped); False otherwise
    '''
    # Encode the image to base64 so that it can be sent to the LLM model.
    # Use a context manager so the file handle is always closed.
    with open(image_path, "rb") as img_file:
        encoded_image = base64.b64encode(img_file.read()).decode('utf-8')
    # Construct the request payload for the Ollama /api/generate endpoint
    prompt_text = prompts['verify_image_context']
    url = f'http://{llm_address}/api/generate'
    data = {
        "model": "minicpm-v",
        "prompt": prompt_text,
        "images": [encoded_image],
        "stream": False
    }
    headers = {'Content-Type': 'application/json'}
    # Send the request to the LLM model and extract the response text.
    # Besides transport errors, also guard against malformed JSON
    # (json.loads raises ValueError) or a reply missing the 'response'
    # key, so a half-broken server answer does not crash the pipeline.
    try:
        response = requests.post(url, data=json.dumps(data), headers=headers)
        json_data = json.loads(response.text)
        text = json_data['response']
    except (requests.exceptions.RequestException, ValueError, KeyError) as e:
        print(f"Error prompting the LLM: {e}")
        # If the request doesn't go through, return True, to not lose anything
        return True
    # Accept only an unambiguous affirmative answer; strip whitespace the
    # model commonly appends before comparing.
    relevant_answers = ["yes", "yes."]
    if text.strip().lower() in relevant_answers:
        return True
    else:
        # Print the other answers for debugging purposes
        print(image_path + ": " + text)
        return False
def process_docx(docx_path, output_folder, llm_address, update):
    '''
    Process a .docx document to extract sequence diagrams and their descriptions.

    Walks the document text while tracking the current h2/h3 section heading;
    every embedded image is converted to PNG and, if the LLM classifies it as
    a sequence diagram, appended to the output document together with an
    OCR-based description. The result is saved as diagrams.docx.

    :param docx_path: The path to the input .docx document
    :param output_folder: The path to the output folder
    :param llm_address: The address of the LLM model
    :param update: Callback update(message, progress) used to refresh the
                   GUI status label and progress bar
    '''
    output_file_path = os.path.join(output_folder, "diagrams.docx")
    output_folder = os.path.join(output_folder, "images")
    with open('prompts.json', 'r') as f:
        prompts = json.load(f)
    if not os.path.exists(docx_path):
        update("Error: the input document was not found.")
        return None
    # docx2python extracts the text (with html-style heading tags) and dumps
    # the embedded media files into output_folder.
    input_doc = docx2python(docx_path, output_folder, html=True)
    output_doc = Document()
    current_section = "No section"
    lines = input_doc.text.splitlines()
    for i, line in enumerate(lines):
        update("Processing the document...", i / len(lines))
        # Remember the most recent h2/h3 heading so each diagram can be
        # attributed to the section it appeared in.
        if "<h2>" in line or "<h3>" in line:
            current_section = line
        if "media/image" in line:
            # Media references embed the filename after a fixed 10-char
            # prefix, followed by '-'-separated attributes.
            img_name = line[10:].split('-')[0]
            img_path = os.path.join(output_folder, img_name)
            # Re-save as PNG: python-docx cannot embed EMF images
            new_name = img_name.replace(".emf", ".png")
            new_path = os.path.join(output_folder, new_name)
            try:
                # Close the source image before deleting it; a dangling
                # PIL file handle makes os.remove fail on Windows.
                with Image.open(img_path) as img:
                    img.save(new_path)
                os.remove(img_path)
                if is_sequence_diagram(new_path, llm_address, prompts):
                    # Strip the surrounding "<h2>"/"</h2>" tags from the heading
                    output_doc.add_heading(current_section[4:-4], level=1)
                    output_doc.add_picture(new_path, width=Inches(6))
                    output_doc.add_paragraph(process_sequence_diagram(new_path))
                    output_doc.add_page_break()
            except Exception as e:
                # Best-effort: skip images that cannot be converted or
                # added, but surface the reason in the GUI.
                update(f"Error adding image {img_name} to the document: {e}")
    output_doc.save(output_file_path)
    update("Finished processing the document. Check the output folder for the results.")
if __name__ == "__main__":
print("usage: python gui_process_images.py")