Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refine table parser #120

Merged
Merged 1 commit on Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/db/services/task_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
.where(
Document.status == StatusEnum.VALID.value,
Document.run == TaskStatus.RUNNING.value,
~(Document.type == FileType.VIRTUAL.value),
cls.model.progress == 0,
cls.model.update_time >= tm,
Expand Down
4 changes: 3 additions & 1 deletion deepdoc/parser/ppt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def __call__(self, fnm, from_page, to_page, callback=None):
BytesIO(fnm))
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides[from_page: to_page]):
for i, slide in enumerate(ppt.slides):
if i < from_page: continue
if i >= to_page:break
texts = []
for shape in slide.shapes:
txt = self.__extract(shape)
Expand Down
5 changes: 4 additions & 1 deletion rag/app/presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
import copy
import re
from io import BytesIO

from PIL import Image

from rag.nlp import tokenize, is_english
from rag.nlp import huqie
from deepdoc.parser import PdfParser, PptParser
Expand All @@ -30,7 +33,7 @@ def __call__(self, fnm, from_page, to_page, callback=None):
for i, slide in enumerate(presentation.slides[from_page: to_page]):
buffered = BytesIO()
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(buffered.getvalue())
imgs.append(Image.open(buffered))
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
Expand Down
14 changes: 7 additions & 7 deletions rag/app/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,9 @@ def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=
continue
data.append(row)
done += 1
if done % 999 == 0:
callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
res.append(pd.DataFrame(np.array(data), columns=headers))

callback(0.6, ("Extract records: {}. ".format(done) + (
callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res

Expand Down Expand Up @@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
headers = lines[0].split(kwargs.get("delimiter", "\t"))
rows = []
for i, line in enumerate(lines[1:]):
if from_page < from_page:continue
if i < from_page:continue
if i >= to_page: break
row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
Expand Down Expand Up @@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
for i in range(len(clmns))]

eng = lang.lower() == "english"#is_english(txts)
for ii, row in df.iterrows():
d = {}
d = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
row_txt = []
for j in range(len(clmns)):
if row[clmns[j]] is None:
Expand Down
4 changes: 2 additions & 2 deletions rag/svr/task_broker.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ def new_task():
tsks.append(task)
elif r["parser_id"] == "table":
rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
for i in range(0, rn, 1000):
for i in range(0, rn, 3000):
task = new_task()
task["from_page"] = i
task["to_page"] = min(i + 1000, rn)
task["to_page"] = min(i + 3000, rn)
tsks.append(task)
else:
tsks.append(new_task())
Expand Down
7 changes: 3 additions & 4 deletions rag/svr/task_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,6 @@ def build(row):

return

callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))

docs = []
doc = {
"doc_id": row["doc_id"],
Expand Down Expand Up @@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
tk_count += c

cnts_ = np.array([])
for i in range(0, len(cnts), 32):
vts, c = mdl.encode(cnts[i: i+32])
for i in range(0, len(cnts), 8):
vts, c = mdl.encode(cnts[i: i+8])
if len(cnts_) == 0: cnts_ = vts
else: cnts_ = np.concatenate((cnts_, vts), axis=0)
tk_count += c
Expand Down Expand Up @@ -226,6 +224,7 @@ def main(comm, mod):
continue
# TODO: exception handler
## set_progress(r["did"], -1, "ERROR: ")
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
try:
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
except Exception as e:
Expand Down