Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 41 additions & 65 deletions .agents/scripts/paddleocr-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -476,81 +476,57 @@ if not results:
print("[]")
sys.exit(0)

# Normalize both API paths into a common entries list to avoid output duplication
entries = []
if use_new_api:
# New API: OCRResult with rec_texts, rec_scores, rec_polys (list of 4-point polygons)
for result in results:
texts = result.get("rec_texts", [])
scores = result.get("rec_scores", [])
polys = result.get("rec_polys", [])

if not texts:
for i, text in enumerate(texts):
entry = {
"text": text,
"confidence": float(scores[i]) if i < len(scores) else 0.0,
}
if i < len(polys):
box = polys[i].tolist() if hasattr(polys[i], "tolist") else polys[i]
entry["box"] = box
entries.append(entry)
else:
# Legacy API: [[box, (text, confidence)], ...]
for page in results:
if page is None:
continue

if output_format == "json":
entries = []
for i, text in enumerate(texts):
entry = {
"text": text,
"confidence": round(float(scores[i]), 4) if i < len(scores) else 0.0,
}
if i < len(polys):
box = polys[i].tolist() if hasattr(polys[i], "tolist") else polys[i]
entry["box"] = box
entries.append(entry)
print(json.dumps(entries, ensure_ascii=False, indent=2))

elif output_format == "tsv":
print("text\tconfidence\tx1\ty1\tx2\ty2\tx3\ty3\tx4\ty4")
for i, text in enumerate(texts):
score = float(scores[i]) if i < len(scores) else 0.0
if i < len(polys):
box = polys[i].tolist() if hasattr(polys[i], "tolist") else polys[i]
coords = "\t".join(f"{p[0]:.0f}\t{p[1]:.0f}" for p in box)
else:
coords = "\t".join(["0"] * 8)
print(f"{text}\t{score:.4f}\t{coords}")

for line in page:
entries.append({
"text": line[1][0],
"confidence": line[1][1],
"box": line[0],
})

if output_format == "json":
for entry in entries:
if "confidence" in entry:
entry["confidence"] = round(entry["confidence"], 4)
print(json.dumps(entries, ensure_ascii=False, indent=2))

elif output_format == "tsv":
print("text\tconfidence\tx1\ty1\tx2\ty2\tx3\ty3\tx4\ty4")
for entry in entries:
text = entry.get("text", "")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-medium medium

To ensure the integrity of the TSV output, it's important to sanitize the text field by removing any tab or newline characters. These characters can break the column structure of the TSV format, leading to parsing errors downstream.

Suggested change
text = entry.get("text", "")
text = entry.get("text", "").replace("\t", " ").replace("\n", " ")
References
  1. Employ a defense-in-depth strategy for handling data. Sanitize data at each point of use, especially before outputting it in a specific format like TSV, to prevent parsing errors and maintain integrity.

confidence = entry.get("confidence", 0.0)
box = entry.get("box")
if box:
coords = "\t".join(f"{p[0]:.0f}\t{p[1]:.0f}" for p in box)
else:
for text in texts:
print(text)
coords = "\t".join(["0"] * 8)
print(f"{text}\t{confidence:.4f}\t{coords}")

else:
# Legacy API: [[box, (text, confidence)], ...]
if output_format == "json":
entries = []
for page in results:
if page is None:
continue
for line in page:
box = line[0]
text = line[1][0]
confidence = line[1][1]
entries.append({
"text": text,
"confidence": round(confidence, 4),
"box": box,
})
print(json.dumps(entries, ensure_ascii=False, indent=2))

elif output_format == "tsv":
print("text\tconfidence\tx1\ty1\tx2\ty2\tx3\ty3\tx4\ty4")
for page in results:
if page is None:
continue
for line in page:
box = line[0]
text = line[1][0]
confidence = line[1][1]
coords = "\t".join(f"{p[0]:.0f}\t{p[1]:.0f}" for p in box)
print(f"{text}\t{confidence:.4f}\t{coords}")

else:
for page in results:
if page is None:
continue
for line in page:
text = line[1][0]
print(text)
# Plain text output
for entry in entries:
print(entry.get("text", ""))
PYEOF
)"

Expand Down
Loading