Skip to content

Commit

Permalink
0517 list chunks (infiniflow#821)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

infiniflow#717 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  • Loading branch information
guoyuhao2330 committed May 17, 2024
1 parent 9f0f5b4 commit 081f922
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 0 deletions.
43 changes: 43 additions & 0 deletions api/apps/api_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO

from rag.utils.es_conn import ELASTICSEARCH
from rag.nlp import search
from elasticsearch_dsl import Q

def generate_confirmation_token(tenent_id):
serializer = URLSafeTimedSerializer(tenent_id)
Expand Down Expand Up @@ -347,3 +350,43 @@ def upload():
return server_error_response(e)

return get_json_result(data=doc_result.to_json())


@manager.route('/list_chunks', methods=['POST'])
# @login_required
def list_chunks():
    """List the indexed chunks of a single document.

    The caller authenticates with an API token taken from the second
    whitespace-separated field of the ``Authorization`` header. The target
    document is selected through the form field ``doc_name`` or ``doc_id``;
    ``doc_name`` takes precedence when both are supplied.

    Returns:
        The JSON result produced by ``get_json_result``. On success ``data``
        is a list with one dict per search hit, holding ``doc_name``,
        ``content`` and, only when the hit carries one, ``img_id``.
        An authentication-error result is returned when the token is missing,
        malformed or unknown. Any exception raised while resolving the
        document or searching is converted by ``server_error_response``.
    """
    # A missing header (None) or one without a second field used to raise
    # AttributeError / IndexError here, outside the try block. Treat both as
    # an authentication failure instead of crashing the request.
    auth_fields = (request.headers.get('Authorization') or '').split()
    token = auth_fields[1] if len(auth_fields) > 1 else None
    if not token or not APIToken.query(token=token):
        # Message text is kept verbatim (including the trailing quote) so
        # clients that match on it keep working.
        return get_json_result(
            data=False, retmsg='Token is not valid!"', retcode=RetCode.AUTHENTICATION_ERROR)

    # NOTE(review): the token is only checked for existence; the tenant that
    # owns the token is never compared with the tenant that owns the
    # document. Confirm whether cross-tenant access must be rejected here.

    form_data = request.form

    try:
        if "doc_name" in form_data:
            tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
            q = Q("match", docnm_kwd=form_data['doc_name'])
        elif "doc_id" in form_data:
            tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
            q = Q("match", doc_id=form_data['doc_id'])
        else:
            return get_json_result(
                data=False, retmsg="Can't find doc_name or doc_id"
            )

        # Both lookups return None when no matching document exists. Fail
        # fast with a clear message rather than deriving an index name from
        # None and sending a pointless query to Elasticsearch.
        if not tenant_id:
            raise LookupError("Can't find the document!")

        res_es_search = ELASTICSEARCH.search(
            q, idxnm=search.index_name(tenant_id), timeout="600s")

        res = []
        for hit in res_es_search['hits']['hits']:
            source = hit['_source']
            chunk = {
                'doc_name': source['docnm_kwd'],
                'content': source['content_with_weight'],
            }
            # img_id is optional: only chunks that have one expose it.
            if 'img_id' in source:
                chunk['img_id'] = source['img_id']
            res.append(chunk)

    except Exception as e:
        return server_error_response(e)

    return get_json_result(data=res)
13 changes: 13 additions & 0 deletions api/db/services/document_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,19 @@ def get_tenant_id(cls, doc_id):
return
return docs[0]["tenant_id"]

@classmethod
@DB.connection_context()
def get_tenant_id_by_name(cls, name):
    """Return the tenant ID of the knowledge base holding the document *name*.

    The document model is joined to ``Knowledgebase`` on ``kb_id`` and only
    knowledge bases whose status equals ``StatusEnum.VALID.value`` are
    considered. When several rows match, the first one returned by the
    query is used.

    Returns:
        The ``tenant_id`` of the first matching row, or ``None`` when no
        row matches.
    """
    # NOTE(review): nothing visible here enforces unique document names, so
    # a name may match documents of several tenants — confirm with callers.
    tenant_query = (
        cls.model
        .select(Knowledgebase.tenant_id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(
            cls.model.name == name,
            Knowledgebase.status == StatusEnum.VALID.value,
        )
    )
    rows = tenant_query.dicts()
    if rows:
        return rows[0]["tenant_id"]
    return None

@classmethod
@DB.connection_context()
def get_thumbnails(cls, docids):
Expand Down
35 changes: 35 additions & 0 deletions docs/conversation_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,3 +364,38 @@ This is usually used when upload a file to.
}

```

## Get document chunks

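Get the chunks of a document, identified by either `doc_name` or `doc_id`. The parameters are sent as form data; if both are supplied, `doc_name` takes precedence.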
### Path: /api/list_chunks
### Method: POST

### Parameter:

| Name | Type | Optional | Description |
|----------|--------|----------|---------------------------------|
| `doc_name` | string | Yes | The name of the document in the knowledge base. It must not be empty if `doc_id` is not set.|
| `doc_id` | string | Yes | The ID of the document in the knowledge base. It must not be empty if `doc_name` is not set.|


### Response
```json
{
"data": [
{
"content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
"doc_name": "RL-Cache.pdf",
"img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
},
{
"content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how e￿ectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
"doc_name": "RL-Cache.pdf",
"img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
}
],
"retcode": 0,
"retmsg": "success"
}

```

0 comments on commit 081f922

Please sign in to comment.