Skip to content

Commit abcd3d2

Browse files
authored
refactor (#1124)
### What problem does this PR solve?

### Type of change

- [x] Refactoring
1 parent 2cc8921 commit abcd3d2

File tree

7 files changed

+73
-83
lines changed

7 files changed

+73
-83
lines changed

api/apps/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ def register_page(page_path):
8585
url_prefix = f'/api/{API_VERSION}/{page_name}' if "_api" in path else f'/{API_VERSION}/{page_name}'
8686

8787
app.register_blueprint(page.manager, url_prefix=url_prefix)
88-
print(f'API file: {page_path}, URL: {url_prefix}')
8988
return url_prefix
9089

9190

api/apps/document_app.py

+63-66
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from rag.utils.minio_conn import MINIO
4141
from api.utils.file_utils import filename_type, thumbnail
4242
from api.utils.web_utils import html2pdf, is_valid_url
43+
from api.utils.web_utils import html2pdf, is_valid_url
4344

4445

4546
@manager.route('/upload', methods=['POST'])
@@ -117,6 +118,68 @@ def upload():
117118
return get_json_result(data=True)
118119

119120

121+
@manager.route('/web_crawl', methods=['POST'])
122+
@login_required
123+
@validate_request("kb_id", "name", "url")
124+
def web_crawl():
125+
kb_id = request.form.get("kb_id")
126+
if not kb_id:
127+
return get_json_result(
128+
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
129+
name = request.form.get("name")
130+
url = request.form.get("url")
131+
if not is_valid_url(url):
132+
return get_json_result(
133+
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
134+
e, kb = KnowledgebaseService.get_by_id(kb_id)
135+
if not e:
136+
raise LookupError("Can't find this knowledgebase!")
137+
138+
blob = html2pdf(url)
139+
if not blob: return server_error_response(ValueError("Download failure."))
140+
141+
root_folder = FileService.get_root_folder(current_user.id)
142+
pf_id = root_folder["id"]
143+
FileService.init_knowledgebase_docs(pf_id, current_user.id)
144+
kb_root_folder = FileService.get_kb_folder(current_user.id)
145+
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
146+
147+
try:
148+
filename = duplicate_name(
149+
DocumentService.query,
150+
name=name+".pdf",
151+
kb_id=kb.id)
152+
filetype = filename_type(filename)
153+
if filetype == FileType.OTHER.value:
154+
raise RuntimeError("This type of file has not been supported yet!")
155+
156+
location = filename
157+
while MINIO.obj_exist(kb_id, location):
158+
location += "_"
159+
MINIO.put(kb_id, location, blob)
160+
doc = {
161+
"id": get_uuid(),
162+
"kb_id": kb.id,
163+
"parser_id": kb.parser_id,
164+
"parser_config": kb.parser_config,
165+
"created_by": current_user.id,
166+
"type": filetype,
167+
"name": filename,
168+
"location": location,
169+
"size": len(blob),
170+
"thumbnail": thumbnail(filename, blob)
171+
}
172+
if doc["type"] == FileType.VISUAL:
173+
doc["parser_id"] = ParserType.PICTURE.value
174+
if re.search(r"\.(ppt|pptx|pages)$", filename):
175+
doc["parser_id"] = ParserType.PRESENTATION.value
176+
DocumentService.insert(doc)
177+
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
178+
except Exception as e:
179+
return server_error_response(e)
180+
return get_json_result(data=True)
181+
182+
120183
@manager.route('/create', methods=['POST'])
121184
@login_required
122185
@validate_request("name", "kb_id")
@@ -417,69 +480,3 @@ def get_image(image_id):
417480
return response
418481
except Exception as e:
419482
return server_error_response(e)
420-
421-
422-
@manager.route('/web_crawl', methods=['POST'])
423-
@login_required
424-
def web_crawl():
425-
kb_id = request.form.get("kb_id")
426-
if not kb_id:
427-
return get_json_result(
428-
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
429-
name = request.form.get("name")
430-
url = request.form.get("url")
431-
if not name:
432-
return get_json_result(
433-
data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
434-
if not url:
435-
return get_json_result(
436-
data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
437-
if not is_valid_url(url):
438-
return get_json_result(
439-
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
440-
e, kb = KnowledgebaseService.get_by_id(kb_id)
441-
if not e:
442-
raise LookupError("Can't find this knowledgebase!")
443-
444-
root_folder = FileService.get_root_folder(current_user.id)
445-
pf_id = root_folder["id"]
446-
FileService.init_knowledgebase_docs(pf_id, current_user.id)
447-
kb_root_folder = FileService.get_kb_folder(current_user.id)
448-
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
449-
450-
try:
451-
filename = duplicate_name(
452-
DocumentService.query,
453-
name=name+".pdf",
454-
kb_id=kb.id)
455-
filetype = filename_type(filename)
456-
if filetype == FileType.OTHER.value:
457-
raise RuntimeError("This type of file has not been supported yet!")
458-
459-
location = filename
460-
while MINIO.obj_exist(kb_id, location):
461-
location += "_"
462-
blob = html2pdf(url)
463-
MINIO.put(kb_id, location, blob)
464-
doc = {
465-
"id": get_uuid(),
466-
"kb_id": kb.id,
467-
"parser_id": kb.parser_id,
468-
"parser_config": kb.parser_config,
469-
"created_by": current_user.id,
470-
"type": filetype,
471-
"name": filename,
472-
"location": location,
473-
"size": len(blob),
474-
"thumbnail": thumbnail(filename, blob)
475-
}
476-
if doc["type"] == FileType.VISUAL:
477-
doc["parser_id"] = ParserType.PICTURE.value
478-
if re.search(r"\.(ppt|pptx|pages)$", filename):
479-
doc["parser_id"] = ParserType.PRESENTATION.value
480-
DocumentService.insert(doc)
481-
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
482-
except Exception as e:
483-
return get_json_result(
484-
data=False, retmsg=e, retcode=RetCode.SERVER_ERROR)
485-
return get_json_result(data=True)

api/db/services/dialog_service.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,15 @@ def chat(dialog, messages, stream=True, **kwargs):
112112
prompt_config["system"] = prompt_config["system"].replace(
113113
"{%s}" % p["key"], " ")
114114

115+
rerank_mdl = None
116+
if dialog.rerank_id:
117+
rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id)
118+
115119
for _ in range(len(questions) // 2):
116120
questions.append(questions[-1])
117121
if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
118122
kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
119123
else:
120-
rerank_mdl = None
121-
if dialog.rerank_id:
122-
rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id)
123124
kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
124125
dialog.similarity_threshold,
125126
dialog.vector_similarity_weight,

api/utils/api_utils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,12 @@ def construct_result(code=RetCode.DATA_ERROR, message='data is missing'):
248248

249249

250250
def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):
251-
if data == None:
251+
if data is None:
252252
return jsonify({"code": code, "message": message})
253253
else:
254254
return jsonify({"code": code, "message": message, "data": data})
255255

256+
256257
def construct_error_response(e):
257258
stat_logger.exception(e)
258259
try:

api/utils/log_utils.py

-5
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,6 @@ def get_handler(class_name, level=None, log_dir=None,
154154
delay=True)
155155
if level:
156156
handler.level = level
157-
else:
158-
handler.level = LoggerFactory.LEVEL
159-
160-
formatter = logging.Formatter(LoggerFactory.LOG_FORMAT)
161-
handler.setFormatter(formatter)
162157

163158
return handler
164159

api/utils/web_utils.py

-2
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,3 @@ def __get_pdf_from_html(
7878

7979
def is_valid_url(url: str) -> bool:
8080
return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
81-
82-

rag/llm/embedding_model.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,8 @@
2626
from openai import OpenAI
2727
from FlagEmbedding import FlagModel
2828
import torch
29-
import asyncio
3029
import numpy as np
31-
30+
import asyncio
3231
from api.utils.file_utils import get_home_cache_dir
3332
from rag.utils import num_tokens_from_string, truncate
3433

@@ -317,12 +316,12 @@ def __init__(
317316
engine_kwargs: dict = {},
318317
key = None,
319318
):
320-
319+
321320
from infinity_emb import EngineArgs
322321
from infinity_emb.engine import AsyncEngineArray
323-
322+
324323
self._default_model = model_names[0]
325-
self.engine_array = AsyncEngineArray.from_args([EngineArgs(model_name_or_path = model_name, **engine_kwargs) for model_name in model_names])
324+
self.engine_array = AsyncEngineArray.from_args([EngineArgs(model_name_or_path = model_name, **engine_kwargs) for model_name in model_names])
326325

327326
async def _embed(self, sentences: list[str], model_name: str = ""):
328327
if not model_name:

0 commit comments

Comments
 (0)