Commit
Merge pull request #139 from netease-youdao/develop_for_v1.2.2
Optimize the parsing logic for CSV and XLSX files. In addition, running the Docker images offline is now supported.
xixihahaliu authored Feb 26, 2024
2 parents feba5fb + dc9234f commit aaf1bc3
Showing 10 changed files with 432 additions and 375 deletions.
5 changes: 3 additions & 2 deletions docker-compose-linux.yaml
@@ -78,7 +78,7 @@ services:

qanything_local:
container_name: qanything-container-local
image: freeren/qanything:v1.1.1
image: freeren/qanything:v1.2.1
# runtime: nvidia
deploy:
resources:
@@ -93,7 +93,6 @@ services:
privileged: true
shm_size: '8gb'
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/models:/model_repos/QAEnsemble
- ${DOCKER_VOLUME_DIRECTORY:-.}/assets/custom_models:/model_repos/CustomLLM
- ${DOCKER_VOLUME_DIRECTORY:-.}/:/workspace/qanything_local/
ports:
@@ -103,6 +102,8 @@
- NCCL_LAUNCH_MODE=PARALLEL
- GPUID1=${GPUID1:-0}
- GPUID2=${GPUID2:-0}
- MODEL_SIZE=${MODEL_SIZE:-0B}
- USER_IP=${USER_IP:-localhost}
depends_on:
- "standalone"
- "mysql"
5 changes: 3 additions & 2 deletions docker-compose-windows.yaml
@@ -78,7 +78,7 @@ services:

qanything_local:
container_name: qanything-container-local
image: freeren/qanything-win:v1.1.1
image: freeren/qanything-win:v1.2.1
# runtime: nvidia
deploy:
resources:
@@ -91,7 +91,6 @@
privileged: true
shm_size: '8gb'
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/models:/model_repos/QAEnsemble
- ${DOCKER_VOLUME_DIRECTORY:-.}/assets/custom_models:/model_repos/CustomLLM
- ${DOCKER_VOLUME_DIRECTORY:-.}/:/workspace/qanything_local/
ports:
@@ -101,6 +100,8 @@
- NCCL_LAUNCH_MODE=PARALLEL
- GPUID1=${GPUID1:-0}
- GPUID2=${GPUID2:-0}
- MODEL_SIZE=${MODEL_SIZE:-0B}
- USER_IP=${USER_IP:-localhost}
depends_on:
- "standalone"
- "mysql"
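In both compose files, `${VAR:-default}` falls back to the default when the variable is unset or empty; the new MODEL_SIZE and USER_IP entries follow the same pattern as GPUID1/GPUID2. A rough Python analogue of that defaulting, for readers unfamiliar with the syntax:

```python
# Rough analogue of compose's ${MODEL_SIZE:-0B} / ${USER_IP:-localhost} defaults.
import os

model_size = os.environ.get('MODEL_SIZE') or '0B'  # unset OR empty -> default
user_ip = os.environ.get('USER_IP') or 'localhost'
print(model_size, user_ip)
```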
13 changes: 11 additions & 2 deletions qanything_kernel/connector/database/milvus/milvus_client.py
@@ -72,7 +72,16 @@ def parse_batch_result(self, batch_result):
"file_name": cand.entity.get('file_name'),
"chunk_id": cand.entity.get('chunk_id')})
new_cands.append(doc)
new_cands = self.expand_cand_docs(new_cands)
# csv and xlsx files skip expand_cand_docs
need_expand, not_need_expand = [], []
for doc in new_cands:
if doc.metadata['file_name'].lower().split('.')[-1] in ['csv', 'xlsx']:
doc.metadata["kernel"] = doc.page_content
not_need_expand.append(doc)
else:
need_expand.append(doc)
expand_res = self.expand_cand_docs(need_expand)
new_cands = not_need_expand + expand_res
new_result.append(new_cands)
return new_result
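For context, a self-contained sketch of the routing above, assuming langchain-style Document objects with page_content and metadata: rows parsed from tabular files are already complete statements, so they keep their own text as the "kernel" and skip neighbor-chunk expansion.

```python
# Minimal, hypothetical sketch of the split above (not the repo's actual API).
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

@dataclass
class Document:
    page_content: str
    metadata: Dict = field(default_factory=dict)

def split_by_expandability(docs: List[Document]) -> Tuple[List[Document], List[Document]]:
    need_expand, no_expand = [], []
    for doc in docs:
        ext = doc.metadata['file_name'].lower().rsplit('.', 1)[-1]
        if ext in ('csv', 'xlsx'):
            # a table row is self-contained: keep it verbatim as the "kernel"
            doc.metadata['kernel'] = doc.page_content
            no_expand.append(doc)
        else:
            need_expand.append(doc)
    return need_expand, no_expand
```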

@@ -194,6 +203,7 @@ def seperate_list(self, ls: List[int]) -> List[List[int]]:

def process_group(self, group):
new_cands = []
# sort each group by chunk_id
group.sort(key=lambda x: int(x.metadata['chunk_id'].split('_')[-1]))
id_set = set()
file_id = group[0].metadata['file_id']
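The numeric sort key matters: parsing the trailing index keeps chunk 10 after chunk 9, which a plain string sort would not. A quick illustration with hypothetical chunk_ids:

```python
# Hypothetical chunk_ids; sort on the trailing integer, not lexicographically.
ids = ['f1_1', 'f1_10', 'f1_2']
ids.sort(key=lambda s: int(s.split('_')[-1]))
assert ids == ['f1_1', 'f1_2', 'f1_10']
# sorted(ids) alone would give ['f1_1', 'f1_10', 'f1_2']
```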
@@ -256,7 +266,6 @@ def expand_cand_docs(self, cand_docs):

with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
# sort each group by chunk_id
for group in m_grouped:
if not group:
continue
6 changes: 5 additions & 1 deletion qanything_kernel/connector/embedding/embedding_for_local.py
@@ -3,7 +3,9 @@

from qanything_kernel.connector.embedding.embedding_client import EmbeddingClient
from qanything_kernel.configs.model_config import LOCAL_EMBED_SERVICE_URL, LOCAL_EMBED_MODEL_NAME, LOCAL_EMBED_MAX_LENGTH, LOCAL_EMBED_BATCH
from qanything_kernel.utils.custom_log import debug_logger
import concurrent.futures
from tqdm import tqdm

embedding_client = EmbeddingClient(
server_url=LOCAL_EMBED_SERVICE_URL,
@@ -24,13 +26,15 @@ def _get_embedding(self, queries):
def _get_len_safe_embeddings(self, texts: List[str]) -> List[List[float]]:
all_embeddings = []
batch_size = LOCAL_EMBED_BATCH

with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
future = executor.submit(self._get_embedding, batch)
futures.append(future)
for future in futures:
debug_logger.info(f'embedding number: {len(futures)}')
for future in tqdm(futures):
embeddings = future.result()
all_embeddings += embeddings
return all_embeddings
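A minimal sketch of the batching pattern above, with a stand-in for the Triton embedding client. Iterating the futures in submission order (rather than as_completed) keeps the output aligned with the input texts, and tqdm now reports batch progress:

```python
# Minimal sketch; embed_batch stands in for the real Triton embedding client.
import concurrent.futures
from typing import List
from tqdm import tqdm

def embed_batch(batch: List[str]) -> List[List[float]]:
    return [[float(len(t))] for t in batch]  # dummy embedding

def embed_all(texts: List[str], batch_size: int = 16) -> List[List[float]]:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(embed_batch, texts[i:i + batch_size])
                   for i in range(0, len(texts), batch_size)]
        all_embeddings: List[List[float]] = []
        for future in tqdm(futures):  # submission order preserves input order
            all_embeddings += future.result()
    return all_embeddings
```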
7 changes: 6 additions & 1 deletion qanything_kernel/core/local_file.py
@@ -15,6 +15,7 @@
from qanything_kernel.utils.loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader
from qanything_kernel.utils.splitter import zh_title_enhance
from sanic.request import File
import pandas as pd
import os

text_splitter = RecursiveCharacterTextSplitter(
@@ -82,7 +83,11 @@ def split_file_to_docs(self, ocr_engine: Callable, sentence_size=SENTENCE_SIZE,
texts_splitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(texts_splitter)
elif self.file_path.lower().endswith(".xlsx"):
loader = UnstructuredExcelLoader(self.file_path, mode="elements")
# loader = UnstructuredExcelLoader(self.file_path, mode="elements")
csv_file_path = self.file_path[:-5] + '.csv'
xlsx = pd.read_excel(self.file_path, engine='openpyxl')
xlsx.to_csv(csv_file_path, index=False)
loader = CSVLoader(csv_file_path, csv_args={"delimiter": ",", "quotechar": '"'})
docs = loader.load()
elif self.file_path.lower().endswith(".pptx"):
loader = UnstructuredPowerPointLoader(self.file_path, mode="elements")
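The xlsx branch now converts the workbook to CSV with pandas and reuses the row-per-document CSVLoader path instead of UnstructuredExcelLoader. A hedged sketch of that conversion (note that pd.read_excel reads only the first sheet by default):

```python
# Minimal sketch of the xlsx -> csv hand-off; path handling is simplified.
import pandas as pd

def xlsx_to_csv(xlsx_path: str) -> str:
    csv_path = xlsx_path[:-5] + '.csv'                # "data.xlsx" -> "data.csv"
    df = pd.read_excel(xlsx_path, engine='openpyxl')  # first sheet only by default
    df.to_csv(csv_path, index=False)
    return csv_path
```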
@@ -5,6 +5,7 @@
from tritonclient import grpc as grpcclient
from qanything_kernel.configs.model_config import LOCAL_RERANK_SERVICE_URL, LOCAL_RERANK_MAX_LENGTH, LOCAL_RERANK_MODEL_NAME, \
LOCAL_RERANK_BATCH
import numpy as np


class LocalRerankBackend:
@@ -42,7 +43,10 @@ def inference(self, serialized_inputs):
result_data = response.as_numpy(output_name)
print('rerank res:', result_data, flush=True)

return result_data.reshape(-1).tolist()
# apply the sigmoid function
sigmoid_scores = 1 / (1 + np.exp(-result_data))

return sigmoid_scores.reshape(-1).tolist()

def merge_inputs(self, chunk1_raw, chunk2):
chunk1 = deepcopy(chunk1_raw)
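Since the raw cross-encoder outputs are unbounded logits, the sigmoid maps them into (0, 1) without changing their order, which makes the scores easier to threshold and compare across queries. A tiny numeric check:

```python
# Sigmoid squashes hypothetical rerank logits into (0, 1); order is preserved.
import numpy as np

logits = np.array([-2.0, 0.0, 3.5])
scores = 1 / (1 + np.exp(-logits))
print(scores.round(3))  # [0.119 0.5   0.971]
```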
21 changes: 16 additions & 5 deletions qanything_kernel/utils/loader/csv_loader.py
@@ -86,6 +86,8 @@ def load(self) -> List[Document]:
def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
docs = []
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
# initialize a dict to store the last non-empty value seen in each column
last_non_empty_values = {}
for i, row in enumerate(csv_reader):
try:
source = (
@@ -97,12 +99,21 @@ def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
raise ValueError(
f"Source column '{self.source_column}' not found in CSV file."
)

line_contents = []
for k, v in row.items():
if k in self.metadata_columns:
continue
line_contents.append(f"{k.strip()}: {v.strip() if v else last_non_empty_values.get(k, v)}")
if v:
last_non_empty_values[k] = v
content = '------------------------\n'
content += " & ".join(
f"{k.strip()}: {v.strip() if v is not None else v}"
for k, v in row.items()
if k not in self.metadata_columns
)
# content += " & ".join(
# f"{k.strip()}: {v.strip() if v is not None else v}"
# for k, v in row.items()
# if k not in self.metadata_columns
# )
content += ' & '.join(line_contents)
content += '\n------------------------'

metadata = {"source": source, "row": i}
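The effect of the forward-fill: an empty cell inherits the last non-empty value seen in its column, which recovers values lost when merged Excel cells are exported to CSV. A small illustration with hypothetical rows:

```python
# Hypothetical rows; empty cells fall back to the column's last non-empty value.
rows = [{'region': 'North', 'sales': '10'},
        {'region': '', 'sales': '12'}]
last_non_empty = {}
for row in rows:
    print(' & '.join(f'{k}: {v if v else last_non_empty.get(k, v)}'
                     for k, v in row.items()))
    for k, v in row.items():
        if v:
            last_non_empty[k] = v
# region: North & sales: 10
# region: North & sales: 12
```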