18
18
import warnings
19
19
from io import BytesIO
20
20
21
+ from elasticsearch_dsl import Q
21
22
from flask import request , send_file
22
23
from flask_login import login_required , current_user
23
24
from httpx import HTTPError
24
- from minio import S3Error
25
25
26
26
from api .contants import NAME_LENGTH_LIMIT
27
- from api .db import FileType , ParserType , FileSource
27
+ from api .db import FileType , ParserType , FileSource , TaskStatus
28
28
from api .db import StatusEnum
29
- from api .db .db_models import File
29
+ from api .db .db_models import File , Task
30
30
from api .db .services import duplicate_name
31
31
from api .db .services .document_service import DocumentService
32
32
from api .db .services .file2document_service import File2DocumentService
33
33
from api .db .services .file_service import FileService
34
34
from api .db .services .knowledgebase_service import KnowledgebaseService
35
+ from api .db .services .task_service import TaskService
35
36
from api .db .services .user_service import TenantService
36
37
from api .settings import RetCode
37
38
from api .utils import get_uuid
38
39
from api .utils .api_utils import construct_json_result , construct_error_response
39
40
from api .utils .api_utils import construct_result , validate_request
40
41
from api .utils .file_utils import filename_type , thumbnail
42
+ from rag .app import book , laws , manual , naive , one , paper , presentation , qa , resume , table , picture
43
+ from rag .nlp import search
44
+ from rag .utils .es_conn import ELASTICSEARCH
41
45
from rag .utils .minio_conn import MINIO
42
46
43
47
MAXIMUM_OF_UPLOADING_FILES = 256
44
48
49
+
45
50
# ------------------------------ create a dataset ---------------------------------------
46
51
47
52
@manager .route ("/" , methods = ["POST" ])
@@ -116,6 +121,7 @@ def create_dataset():
116
121
except Exception as e :
117
122
return construct_error_response (e )
118
123
124
+
119
125
# -----------------------------list datasets-------------------------------------------------------
120
126
121
127
@manager .route ("/" , methods = ["GET" ])
@@ -135,6 +141,7 @@ def list_datasets():
135
141
except HTTPError as http_err :
136
142
return construct_json_result (http_err )
137
143
144
+
138
145
# ---------------------------------delete a dataset ----------------------------
139
146
140
147
@manager .route ("/<dataset_id>" , methods = ["DELETE" ])
@@ -162,13 +169,15 @@ def remove_dataset(dataset_id):
162
169
163
170
# delete the dataset
164
171
if not KnowledgebaseService .delete_by_id (dataset_id ):
165
- return construct_json_result (code = RetCode .DATA_ERROR , message = "There was an error during the dataset removal process. "
166
- "Please check the status of the RAGFlow server and try the removal again." )
172
+ return construct_json_result (code = RetCode .DATA_ERROR ,
173
+ message = "There was an error during the dataset removal process. "
174
+ "Please check the status of the RAGFlow server and try the removal again." )
167
175
# success
168
176
return construct_json_result (code = RetCode .SUCCESS , message = f"Remove dataset: { dataset_id } successfully" )
169
177
except Exception as e :
170
178
return construct_error_response (e )
171
179
180
+
172
181
# ------------------------------ get details of a dataset ----------------------------------------
173
182
174
183
@manager .route ("/<dataset_id>" , methods = ["GET" ])
@@ -182,6 +191,7 @@ def get_dataset(dataset_id):
182
191
except Exception as e :
183
192
return construct_json_result (e )
184
193
194
+
185
195
# ------------------------------ update a dataset --------------------------------------------
186
196
187
197
@manager .route ("/<dataset_id>" , methods = ["PUT" ])
@@ -209,8 +219,9 @@ def update_dataset(dataset_id):
209
219
if name .lower () != dataset .name .lower () \
210
220
and len (KnowledgebaseService .query (name = name , tenant_id = current_user .id ,
211
221
status = StatusEnum .VALID .value )) > 1 :
212
- return construct_json_result (code = RetCode .DATA_ERROR , message = f"The name: { name .lower ()} is already used by other "
213
- f"datasets. Please choose a different name." )
222
+ return construct_json_result (code = RetCode .DATA_ERROR ,
223
+ message = f"The name: { name .lower ()} is already used by other "
224
+ f"datasets. Please choose a different name." )
214
225
215
226
dataset_updating_data = {}
216
227
chunk_num = req .get ("chunk_num" )
@@ -222,17 +233,21 @@ def update_dataset(dataset_id):
222
233
if chunk_num == 0 :
223
234
dataset_updating_data ["embd_id" ] = req ["embedding_model_id" ]
224
235
else :
225
- construct_json_result (code = RetCode .DATA_ERROR , message = "You have already parsed the document in this "
236
+ return construct_json_result (code = RetCode .DATA_ERROR , message = "You have already parsed the document in this "
226
237
"dataset, so you cannot change the embedding "
227
238
"model." )
228
239
# only if chunk_num is 0, the user can update the chunk_method
229
- if req .get ("chunk_method" ):
230
- if chunk_num == 0 :
231
- dataset_updating_data ['parser_id' ] = req ["chunk_method" ]
232
- else :
240
+ if "chunk_method" in req :
241
+ type_value = req ["chunk_method" ]
242
+ if is_illegal_value_for_enum (type_value , ParserType ):
243
+ return construct_json_result (message = f"Illegal value { type_value } for 'chunk_method' field." ,
244
+ code = RetCode .DATA_ERROR )
245
+ if chunk_num != 0 :
233
246
construct_json_result (code = RetCode .DATA_ERROR , message = "You have already parsed the document "
234
247
"in this dataset, so you cannot "
235
248
"change the chunk method." )
249
+ dataset_updating_data ["parser_id" ] = req ["template_type" ]
250
+
236
251
# convert the photo parameter to avatar
237
252
if req .get ("photo" ):
238
253
dataset_updating_data ["avatar" ] = req ["photo" ]
@@ -265,6 +280,7 @@ def update_dataset(dataset_id):
265
280
except Exception as e :
266
281
return construct_error_response (e )
267
282
283
+
268
284
# --------------------------------content management ----------------------------------------------
269
285
270
286
# ----------------------------upload files-----------------------------------------------------
@@ -339,9 +355,10 @@ def upload_documents(dataset_id):
339
355
location += "_"
340
356
341
357
blob = file .read ()
358
+
342
359
# the content is empty, raising a warning
343
360
if blob == b'' :
344
- warnings .warn (f"[WARNING]: The file { filename } is empty." )
361
+ warnings .warn (f"[WARNING]: The content of the file { filename } is empty." )
345
362
346
363
MINIO .put (dataset_id , location , blob )
347
364
@@ -453,6 +470,7 @@ def list_documents(dataset_id):
453
470
except Exception as e :
454
471
return construct_error_response (e )
455
472
473
+
456
474
# ----------------------------update: enable rename-----------------------------------------------------
457
475
@manager .route ("/<dataset_id>/documents/<document_id>" , methods = ["PUT" ])
458
476
@login_required
@@ -555,6 +573,7 @@ def update_document(dataset_id, document_id):
555
573
def is_illegal_value_for_enum(value, enum_class):
    """Tell whether ``value`` is unacceptable for ``enum_class``.

    A value is legal when it equals one of the enum's members or the raw
    ``.value`` of one of them. Checking the raw values as well makes the
    helper work for plain ``Enum`` classes, whose members never compare
    equal to their underlying values; for str/int-mixin enums the result
    is the same as a plain membership test.

    Args:
        value: the candidate, either an enum member or a raw value.
        enum_class: the ``Enum`` subclass to validate against.

    Returns:
        ``True`` if ``value`` matches no member, ``False`` otherwise.
    """
    return all(value != member and value != member.value
               for member in enum_class.__members__.values())
557
575
576
+
558
577
# ----------------------------download a file-----------------------------------------------------
559
578
@manager .route ("/<dataset_id>/documents/<document_id>" , methods = ["GET" ])
560
579
@login_required
@@ -563,7 +582,8 @@ def download_document(dataset_id, document_id):
563
582
# Check whether there is this dataset
564
583
exist , _ = KnowledgebaseService .get_by_id (dataset_id )
565
584
if not exist :
566
- return construct_json_result (code = RetCode .DATA_ERROR , message = f"This dataset '{ dataset_id } ' cannot be found!" )
585
+ return construct_json_result (code = RetCode .DATA_ERROR ,
586
+ message = f"This dataset '{ dataset_id } ' cannot be found!" )
567
587
568
588
# Check whether there is this document
569
589
exist , document = DocumentService .get_by_id (document_id )
@@ -591,8 +611,142 @@ def download_document(dataset_id, document_id):
591
611
except Exception as e :
592
612
return construct_error_response (e )
593
613
594
- # ----------------------------start parsing-----------------------------------------------------
595
614
615
+ # ----------------------------start parsing a document-----------------------------------------------------
616
+ # helper method for parsing
617
def dummy(prog=None, msg=""):
    """Placeholder progress callback: accepts a progress value and a message and ignores both."""
    return None
619
+
620
+
621
def doc_parse(binary, doc_name, parser_name, tenant_id):
    """Run the chunker selected by ``parser_name`` over a document's bytes.

    Args:
        binary: raw content of the document.
        doc_name: name passed to the chunker as the file name.
        parser_name: identifier of the parser to use (e.g. ``"naive"``).
        tenant_id: forwarded to the picture parser only.

    Returns:
        ``True`` when ``parser_name`` names a supported parser (whatever the
        chunker itself returns is discarded), ``False`` otherwise.
    """
    # The picture parser is the only one that takes extra arguments.
    if parser_name == "picture":
        picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", callback=dummy)
        return True

    if parser_name == "book":
        chunker = book
    elif parser_name == "laws":
        chunker = laws
    elif parser_name == "manual":
        chunker = manual
    elif parser_name == "naive":
        # It's the mode by default, which is general in the front-end
        chunker = naive
    elif parser_name == "one":
        chunker = one
    elif parser_name == "paper":
        chunker = paper
    elif parser_name == "presentation":
        chunker = presentation
    elif parser_name == "qa":
        chunker = qa
    elif parser_name == "resume":
        chunker = resume
    elif parser_name == "table":
        chunker = table
    else:
        # unknown parser: nothing is run
        return False

    chunker.chunk(doc_name, binary=binary, callback=dummy)
    return True
650
+
651
+
652
@manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"])
@login_required
def parse_document(dataset_id, document_id):
    """Start parsing a single document of a dataset.

    Responds with a DATA_ERROR result when the dataset does not exist.
    Otherwise the document is handed to
    ``get_message_during_parsing_document``: a string outcome is wrapped in
    a SUCCESS result, anything else is returned as-is because it already is
    a response. Any exception is converted by ``construct_error_response``.
    """
    try:
        # valid dataset
        dataset_found, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not dataset_found:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        outcome = get_message_during_parsing_document(document_id, "")
        if not isinstance(outcome, str):
            # the helper produced a ready-made response
            return outcome
        return construct_json_result(code=RetCode.SUCCESS, message=outcome)
    except Exception as e:
        return construct_error_response(e)
671
+
672
+
673
+ # ----------------------------start parsing documents-----------------------------------------------------
674
@manager.route("/<dataset_id>/documents/status", methods=["POST"])
@login_required
def parse_documents(dataset_id):
    """Start parsing several documents of a dataset.

    The JSON body may carry ``doc_ids``, a list of document ids. When the
    key is missing or the list is empty, every document listed for the
    dataset is parsed instead.

    Returns:
        A DATA_ERROR result when the dataset does not exist; the first
        non-string outcome returned by
        ``get_message_during_parsing_document`` (already a response); or a
        SUCCESS result whose message is the concatenation of the
        per-document messages. Any exception, including a malformed
        request body, is converted by ``construct_error_response``.
    """
    try:
        # Read the body inside the try and tolerate an absent "doc_ids" key,
        # so the "parse everything" fallback below is actually reachable.
        doc_ids = (request.json or {}).get("doc_ids")

        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        if not doc_ids:
            # no explicit selection: take the documents inside the dataset
            docs, _ = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time",
                                                                True, "")
            doc_ids = [doc["id"] for doc in docs]

        messages = []
        for doc_id in doc_ids:
            # Pass an empty prefix: the helper returns its prefix together with
            # the new text, so feeding it the running message and appending the
            # result would repeat earlier messages.
            res = get_message_during_parsing_document(doc_id, "")
            if not isinstance(res, str):
                return res
            messages.append(res)
        return construct_json_result(data=True, code=RetCode.SUCCESS, message="".join(messages))

    except Exception as e:
        return construct_error_response(e)
707
+
708
+
709
+ # helper method for getting message or response when parsing the document
710
def get_message_during_parsing_document(id, message):
    """Reset one document's parsing state, run its parser and report problems.

    Steps: verify the document and its tenant exist, reset the document's
    run/progress counters, delete its entries from the tenant's search
    index, fetch the stored bytes and hand them to ``doc_parse``.

    Args:
        id: identifier of the document to parse.
        message: kept for backward compatibility. Callers append the
            returned text to their own accumulator, so the prefix is not
            echoed back (echoing it made callers duplicate earlier text).

    Returns:
        A string with the notes produced for this document (empty when
        there is nothing to report), or a response object built by
        ``construct_json_result`` / ``construct_error_response`` when the
        document or tenant is missing or an exception occurs.
    """
    try:
        # Check whether there is this document
        exist, _ = DocumentService.get_by_id(id)
        if not exist:
            return construct_json_result(message=f"This document '{id}' cannot be found!",
                                         code=RetCode.ARGUMENT_ERROR)

        tenant_id = DocumentService.get_tenant_id(id)
        if not tenant_id:
            return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR)

        # reset the parsing bookkeeping before starting over
        info = {"run": "1", "progress": 0, "progress_msg": "", "chunk_num": 0, "token_num": 0}
        DocumentService.update_by_id(id, info)

        # drop what was indexed for this document previously
        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))

        # re-read the document so the attributes reflect the update above
        _, doc_attributes = DocumentService.get_by_id(id)
        doc_attributes = doc_attributes.to_dict()
        doc_id = doc_attributes["id"]

        bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id)
        binary = MINIO.get(bucket, doc_name)
        parser_name = doc_attributes["parser_id"]

        notes = []
        if binary:
            res = doc_parse(binary, doc_name, parser_name, tenant_id)
            if res is False:
                notes.append(f"The parser id: {parser_name} of the document {doc_id} is not supported; ")
        else:
            notes.append(f"Empty data in the document: {doc_name}; ")

        # failed in parsing
        # NOTE(review): doc_attributes was read before doc_parse ran, and the
        # task state written above is stored under "run" while this compares
        # "status" with a TaskStatus value - confirm this check can fire.
        if doc_attributes["status"] == TaskStatus.FAIL.value:
            notes.append(f"Failed in parsing the document: {doc_id}; ")
        return "".join(notes)
    except Exception as e:
        return construct_error_response(e)
596
750
# ----------------------------stop parsing-----------------------------------------------------
597
751
598
752
# ----------------------------show the status of the file-----------------------------------------------------
@@ -610,6 +764,3 @@ def download_document(dataset_id, document_id):
610
764
# ----------------------------get a specific chunk-----------------------------------------------------
611
765
612
766
# ----------------------------retrieval test-----------------------------------------------------
613
-
614
-
615
-
0 commit comments