|
40 | 40 | from rag.utils.minio_conn import MINIO
|
41 | 41 | from api.utils.file_utils import filename_type, thumbnail
|
42 | 42 | from api.utils.web_utils import html2pdf, is_valid_url
|
| 43 | +from api.utils.web_utils import html2pdf, is_valid_url |
43 | 44 |
|
44 | 45 |
|
45 | 46 | @manager.route('/upload', methods=['POST'])
|
@@ -117,6 +118,68 @@ def upload():
|
117 | 118 | return get_json_result(data=True)
|
118 | 119 |
|
119 | 120 |
|
@manager.route('/web_crawl', methods=['POST'])
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
    """Convert a web page with ``html2pdf`` and register it as a KB document.

    Form fields read from the request:
        kb_id: id of the knowledge base that receives the document.
        name: document name; ".pdf" is appended before de-duplication.
        url: address handed to ``html2pdf``.

    Returns:
        ``get_json_result(data=True)`` on success. A blank or invalid form
        field yields an ``ARGUMENT_ERROR`` JSON result. An unknown knowledge
        base, an empty download, or any exception raised while storing the
        document yields ``server_error_response(...)``.
    """
    kb_id = request.form.get("kb_id")
    if not kb_id:
        return get_json_result(
            data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)

    name = request.form.get("name")
    # NOTE(review): presumably validate_request only checks that the field is
    # present, not that it is non-empty -- confirm. A blank name would
    # otherwise produce a document literally called ".pdf".
    if not name:
        return get_json_result(
            data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)

    url = request.form.get("url")
    # Reject a missing/blank URL before handing it to the validator.
    if not url or not is_valid_url(url):
        return get_json_result(
            data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)

    found, kb = KnowledgebaseService.get_by_id(kb_id)
    if not found:
        # Report through the same response helper as the other failures in
        # this handler instead of letting the exception escape the view.
        return server_error_response(
            LookupError("Can't find this knowledgebase!"))

    # Download first so nothing is created when the page cannot be fetched.
    blob = html2pdf(url)
    if not blob:
        return server_error_response(ValueError("Download failure."))

    root_folder = FileService.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    kb_folder = FileService.new_a_file_from_kb(
        kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(
            DocumentService.query,
            name=name + ".pdf",
            kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Append "_" until the object key is unused in this bucket.
        location = filename
        while MINIO.obj_exist(kb_id, location):
            location += "_"
        MINIO.put(kb_id, location, blob)

        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob)
        }
        # Compare against the enum *value*, matching the OTHER check above.
        if doc["type"] == FileType.VISUAL.value:
            doc["parser_id"] = ParserType.PICTURE.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as e:
        return server_error_response(e)
    return get_json_result(data=True)
| 181 | + |
| 182 | + |
120 | 183 | @manager.route('/create', methods=['POST'])
|
121 | 184 | @login_required
|
122 | 185 | @validate_request("name", "kb_id")
|
@@ -417,69 +480,3 @@ def get_image(image_id):
|
417 | 480 | return response
|
418 | 481 | except Exception as e:
|
419 | 482 | return server_error_response(e)
|
420 |
| - |
421 |
| - |
422 |
| -@manager.route('/web_crawl', methods=['POST']) |
423 |
| -@login_required |
424 |
| -def web_crawl(): |
425 |
| - kb_id = request.form.get("kb_id") |
426 |
| - if not kb_id: |
427 |
| - return get_json_result( |
428 |
| - data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR) |
429 |
| - name = request.form.get("name") |
430 |
| - url = request.form.get("url") |
431 |
| - if not name: |
432 |
| - return get_json_result( |
433 |
| - data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR) |
434 |
| - if not url: |
435 |
| - return get_json_result( |
436 |
| - data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR) |
437 |
| - if not is_valid_url(url): |
438 |
| - return get_json_result( |
439 |
| - data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR) |
440 |
| - e, kb = KnowledgebaseService.get_by_id(kb_id) |
441 |
| - if not e: |
442 |
| - raise LookupError("Can't find this knowledgebase!") |
443 |
| - |
444 |
| - root_folder = FileService.get_root_folder(current_user.id) |
445 |
| - pf_id = root_folder["id"] |
446 |
| - FileService.init_knowledgebase_docs(pf_id, current_user.id) |
447 |
| - kb_root_folder = FileService.get_kb_folder(current_user.id) |
448 |
| - kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) |
449 |
| - |
450 |
| - try: |
451 |
| - filename = duplicate_name( |
452 |
| - DocumentService.query, |
453 |
| - name=name+".pdf", |
454 |
| - kb_id=kb.id) |
455 |
| - filetype = filename_type(filename) |
456 |
| - if filetype == FileType.OTHER.value: |
457 |
| - raise RuntimeError("This type of file has not been supported yet!") |
458 |
| - |
459 |
| - location = filename |
460 |
| - while MINIO.obj_exist(kb_id, location): |
461 |
| - location += "_" |
462 |
| - blob = html2pdf(url) |
463 |
| - MINIO.put(kb_id, location, blob) |
464 |
| - doc = { |
465 |
| - "id": get_uuid(), |
466 |
| - "kb_id": kb.id, |
467 |
| - "parser_id": kb.parser_id, |
468 |
| - "parser_config": kb.parser_config, |
469 |
| - "created_by": current_user.id, |
470 |
| - "type": filetype, |
471 |
| - "name": filename, |
472 |
| - "location": location, |
473 |
| - "size": len(blob), |
474 |
| - "thumbnail": thumbnail(filename, blob) |
475 |
| - } |
476 |
| - if doc["type"] == FileType.VISUAL: |
477 |
| - doc["parser_id"] = ParserType.PICTURE.value |
478 |
| - if re.search(r"\.(ppt|pptx|pages)$", filename): |
479 |
| - doc["parser_id"] = ParserType.PRESENTATION.value |
480 |
| - DocumentService.insert(doc) |
481 |
| - FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) |
482 |
| - except Exception as e: |
483 |
| - return get_json_result( |
484 |
| - data=False, retmsg=e, retcode=RetCode.SERVER_ERROR) |
485 |
| - return get_json_result(data=True) |
0 commit comments