Add English semantic search of pipelines #3718

Merged: 8 commits, Nov 16, 2022
Changes from 7 commits
45 changes: 45 additions & 0 deletions paddlenlp/transformers/ernie/modeling.py
@@ -673,6 +673,45 @@ class ErniePretrainedModel(PretrainedModel):
"use_task_id": True,
"vocab_size": 40000
},
"rocketqav2-en-marco-cross-encoder": {
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 4,
"vocab_size": 30522,
"pad_token_id": 0,
},
"rocketqav2-en-marco-query-encoder": {
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 4,
"vocab_size": 30522,
"pad_token_id": 0,
},
"rocketqav2-en-marco-para-encoder": {
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 4,
"vocab_size": 30522,
"pad_token_id": 0,
},
}
resource_files_names = {"model_state": "model_state.pdparams"}
pretrained_resource_files_map = {
@@ -752,6 +791,12 @@ class ErniePretrainedModel(PretrainedModel):
"https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-micro-cross-encoder.pdparams",
"rocketqa-nano-cross-encoder":
"https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-nano-cross-encoder.pdparams",
"rocketqav2-en-marco-cross-encoder":
"https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_cross_encoder.pdparams",
"rocketqav2-en-marco-query-encoder":
"https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_query_encoder.pdparams",
"rocketqav2-en-marco-para-encoder":
"https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_para_encoder.pdparams",
}
}
base_model_prefix = "ernie"
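The three new English checkpoints are registered in both the config map and the resource map above, so they should resolve by name through the standard `from_pretrained` flow. A minimal sketch (the checkpoint name comes from this diff; the rest is ordinary PaddleNLP usage):

```python
# Sketch: load one of the newly registered English checkpoints by name.
from paddlenlp.transformers import ErnieModel

# Resolves to rocketqav2_en_marco_query_encoder.pdparams and the registered
# 12-layer, 768-hidden config with the 30522-entry English vocab.
model = ErnieModel.from_pretrained("rocketqav2-en-marco-query-encoder")
```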
20 changes: 19 additions & 1 deletion paddlenlp/transformers/ernie/tokenizer.py
@@ -66,7 +66,10 @@
"rocketqa-medium-cross-encoder": 2048,
"rocketqa-mini-cross-encoder": 2048,
"rocketqa-micro-cross-encoder": 2048,
"rocketqa-nano-cross-encoder": 2048
"rocketqa-nano-cross-encoder": 2048,
"rocketqav2-en-marco-cross-encoder": 512,
"rocketqav2-en-marco-query-encoder": 512,
"rocketqav2-en-marco-para-encoder": 512,
}


@@ -202,6 +205,12 @@ class ErnieTokenizer(PretrainedTokenizer):
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt",
"rocketqa-nano-cross-encoder":
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt",
"rocketqav2-en-marco-cross-encoder":
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt",
"rocketqav2-en-marco-query-encoder":
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt",
"rocketqav2-en-marco-para-encoder":
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt",
}
}
pretrained_init_configuration = {
@@ -325,6 +334,15 @@ class ErnieTokenizer(PretrainedTokenizer):
"rocketqa-nano-cross-encoder": {
"do_lower_case": True
},
"rocketqav2-en-marco-cross-encoder": {
"do_lower_case": True
},
"rocketqav2-en-marco-query-encoder": {
"do_lower_case": True
},
"rocketqav2-en-marco-para-encoder": {
"do_lower_case": True
},
}

max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
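Since the vocab URLs, 512-token positional limits, and `do_lower_case` flags above are keyed to the same names, the tokenizer side should resolve the same way. A short sketch under the same assumptions:

```python
# Sketch: the English checkpoints share the ernie_v2_base vocab, lowercase
# their input, and have a registered max input size of 512 tokens.
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("rocketqav2-en-marco-query-encoder")
features = tokenizer("What is the capital of America?", max_seq_len=512)
print(features["input_ids"])  # lowercased English wordpiece ids
```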
37 changes: 35 additions & 2 deletions paddlenlp/transformers/semantic_search/modeling.py
@@ -80,6 +80,7 @@ def __init__(self,
title_model_name_or_path=None,
share_parameters=False,
dropout=None,
reinitialize=False,
use_cross_batch=False):

super().__init__()
@@ -96,6 +97,15 @@
assert (self.query_ernie is not None) or (self.title_ernie is not None), \
"At least one of query_ernie and title_ernie should not be None"

# For compatibility with RocketQA v2 initialization: reset LayerNorm epsilon to 1e-5
if reinitialize:
self.apply(self.init_weights)

def init_weights(self, layer):
""" Initialization hook """
if isinstance(layer, nn.LayerNorm):
layer._epsilon = 1e-5

def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
@@ -200,7 +210,6 @@ def forward(self,
paddle.distributed.all_gather(tensor_list, all_title_cls_embedding)
all_title_cls_embedding = paddle.concat(x=tensor_list, axis=0)

# multiply
logits = paddle.matmul(query_cls_embedding,
all_title_cls_embedding,
transpose_y=True)
@@ -242,9 +251,20 @@ class ErnieCrossEncoder(nn.Layer):
def __init__(self,
pretrain_model_name_or_path,
num_classes=2,
reinitialize=False,
dropout=None):
super().__init__()
self.ernie = ErnieEncoder.from_pretrained(pretrain_model_name_or_path)

self.ernie = ErnieEncoder.from_pretrained(pretrain_model_name_or_path,
num_classes=num_classes)
# For compatibility with RocketQA v2 initialization: reset LayerNorm epsilon to 1e-5
if reinitialize:
self.apply(self.init_weights)

def init_weights(self, layer):
""" Initialization hook """
if isinstance(layer, nn.LayerNorm):
layer._epsilon = 1e-5

def matching(self,
input_ids,
@@ -263,6 +283,19 @@ def matching(self,
return probs
return probs[:, 1]

def matching_v2(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
sequence_output, _ = self.ernie(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask)
pooled_output = self.ernie.dropout(sequence_output[:, 0])
probs = self.ernie.classifier(pooled_output)
return probs

def forward(self,
input_ids,
token_type_ids=None,
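Putting the new pieces together, scoring a query-passage pair with the English cross-encoder goes through `matching_v2`, which pools the `[CLS]` vector and applies the classifier head directly. A minimal sketch (the model name, `reinitialize` flag, and `matching_v2` come from this diff; the example pair is made up):

```python
# Sketch: score one query-passage pair with the new English cross-encoder.
import paddle
from paddlenlp.transformers import ErnieTokenizer
from paddlenlp.transformers.semantic_search.modeling import ErnieCrossEncoder

model = ErnieCrossEncoder("rocketqav2-en-marco-cross-encoder", reinitialize=True)
tokenizer = ErnieTokenizer.from_pretrained("rocketqav2-en-marco-cross-encoder")

features = tokenizer("what is the capital of america",
                     "Washington, D.C. is the capital of the United States.",
                     max_seq_len=512)
inputs = {k: paddle.to_tensor([v]) for k, v in features.items()}

model.eval()
with paddle.no_grad():
    scores = model.matching_v2(**inputs)  # classifier output over the two classes
```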
4 changes: 2 additions & 2 deletions pipelines/API.md
@@ -13,7 +13,7 @@
| rocketqa-zh-mini-query-encoder | Chinese | 6-layer, 384-hidden, 12-heads, 27M parameters. Trained on DuReader retrieval text. |
| rocketqa-zh-micro-query-encoder | Chinese | 4-layer, 384-hidden, 12-heads, 23M parameters. Trained on DuReader retrieval text. |
| rocketqa-zh-nano-query-encoder | Chinese | 4-layer, 312-hidden, 12-heads, 18M parameters. Trained on DuReader retrieval text. |

| rocketqav2-en-marco-query-encoder | English | 12-layer, 768-hidden, 12-heads, 118M parameters. Trained on MSMARCO. |

## ErnieRanker

@@ -26,7 +26,7 @@
| rocketqa-mini-cross-encoder | Chinese | 6-layer, 384-hidden, 12-heads, 27M parameters. Trained on DuReader retrieval text. |
| rocketqa-micro-cross-encoder | Chinese | 4-layer, 384-hidden, 12-heads, 23M parameters. Trained on DuReader retrieval text. |
| rocketqa-nano-cross-encoder | Chinese | 4-layer, 312-hidden, 12-heads, 18M parameters. Trained on DuReader retrieval text. |

| rocketqav2-en-marco-cross-encoder | English | 12-layer, 768-hidden, 12-heads, 118M parameters. Trained on MSMARCO. |

## ErnieReader

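On the retriever side, the table's English query encoder pairs with `DensePassageRetriever` the same way the `semantic_search_en.yaml` below configures it. A hedged sketch (component names and params come from that YAML; the import paths are assumed from the pipelines package layout):

```python
# Sketch: build an English dense retriever, mirroring semantic_search_en.yaml.
from pipelines.document_stores import ElasticsearchDocumentStore
from pipelines.nodes import DensePassageRetriever

document_store = ElasticsearchDocumentStore(host="localhost", port=9200,
                                            index="msmarco_query_encoder",
                                            embedding_dim=768)
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="rocketqav2-en-marco-query-encoder",
    passage_embedding_model="rocketqav2-en-marco-query-encoder",
    embed_title=False,
)
```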
16 changes: 16 additions & 0 deletions pipelines/examples/semantic-search/run_search_server.sh
@@ -1,5 +1,21 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Specify the YAML configuration file for the semantic search system
export CUDA_VISIBLE_DEVICES=0
export PIPELINE_YAML_PATH=rest_api/pipeline/semantic_search.yaml
# English Version
# export PIPELINE_YAML_PATH=rest_api/pipeline/semantic_search_en.yaml
# Start the model service on port 8891
python rest_api/application.py 8891
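Once the server is up, a quick smoke test from Python; note the `/query` endpoint path and payload shape are assumptions about the rest_api application, not part of this diff:

```python
# Hypothetical smoke test against the search server started above.
import requests

resp = requests.post("http://127.0.0.1:8891/query",
                     json={"query": "The introduction of United States of America?"})
print(resp.json())  # expected: retrieved and re-ranked documents
```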
21 changes: 20 additions & 1 deletion pipelines/examples/semantic-search/run_search_web.sh
@@ -1,5 +1,24 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

unset http_proxy && unset https_proxy
# Configure the model service endpoint
export API_ENDPOINT=http://127.0.0.1:8891
# Start the WebUI on port 8502
python -m streamlit run ui/webapp_semantic_search.py --server.port 8502

# English Version
# export EVAL_FILE=ui/country_search.csv
# export DEFAULT_QUESTION_AT_STARTUP="The introduction of United States of America?"
# python -m streamlit run ui/webapp_semantic_search.py --server.port 8502
18 changes: 14 additions & 4 deletions pipelines/pipelines/nodes/ranker/ernie_ranker.py
@@ -48,6 +48,8 @@ def __init__(
max_seq_len: int = 256,
progress_bar: bool = True,
batch_size: int = 1000,
reinitialize: bool = False,
use_en: bool = False,
):
"""
:param model_name_or_path: Directory of a saved model or the name of a public model e.g.
@@ -60,14 +62,18 @@
self.set_config(
model_name_or_path=model_name_or_path,
top_k=top_k,
use_en=use_en,
)

self.top_k = top_k
# Controls whether the English cross-encoder scoring path is used
self.use_en = use_en

self.devices, _ = initialize_device_settings(use_cuda=use_gpu,
multi_gpu=True)
print("Loading Parameters from:{}".format(model_name_or_path))
self.transformer_model = ErnieCrossEncoder(model_name_or_path)
self.transformer_model = ErnieCrossEncoder(model_name_or_path,
reinitialize=reinitialize)
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.transformer_model.eval()
self.progress_bar = progress_bar
@@ -156,15 +162,19 @@ def predict_batch(
for cur_queries, cur_docs in batches:
features = self.tokenizer(cur_queries,
[doc.content for doc in cur_docs],
max_seq_len=256,
max_seq_len=self.max_seq_len,
pad_to_max_seq_len=True,
truncation_strategy="longest_first")

tensors = {k: paddle.to_tensor(v) for (k, v) in features.items()}

with paddle.no_grad():
similarity_scores = self.transformer_model.matching(
**tensors).numpy()
if self.use_en:
similarity_scores = self.transformer_model.matching_v2(
**tensors).numpy()
else:
similarity_scores = self.transformer_model.matching(
**tensors).numpy()
preds.extend(similarity_scores)

for doc, rank_score in zip(cur_docs, similarity_scores):
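The new flags surface in the ranker's constructor, so an English setup is just a matter of passing them through. A sketch (parameter names from this diff; the `pipelines.nodes` import path is assumed):

```python
# Sketch: construct the ranker with the new English switches.
from pipelines.nodes import ErnieRanker

ranker = ErnieRanker(
    model_name_or_path="rocketqav2-en-marco-cross-encoder",
    top_k=3,
    max_seq_len=512,
    use_en=True,        # route scoring through matching_v2
    reinitialize=True,  # reset LayerNorm epsilon to 1e-5 for RocketQA v2
)
```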
2 changes: 1 addition & 1 deletion pipelines/rest_api/pipeline/dense_faq.yaml
@@ -39,7 +39,7 @@ components: # define all the building-blocks for Pipeline
type: FileTypeClassifier

pipelines:
- name: query # a sample extractive-qa Pipeline
- name: query
type: Query
nodes:
- name: Retriever
2 changes: 1 addition & 1 deletion pipelines/rest_api/pipeline/semantic_search.yaml
@@ -38,7 +38,7 @@ components: # define all the building-blocks for Pipeline
type: FileTypeClassifier

pipelines:
- name: query # a sample extractive-qa Pipeline
- name: query
type: Query
nodes:
- name: Retriever
68 changes: 68 additions & 0 deletions pipelines/rest_api/pipeline/semantic_search_en.yaml
@@ -0,0 +1,68 @@
version: '1.1.0'

components: # define all the building-blocks for Pipeline
- name: DocumentStore
type: ElasticsearchDocumentStore # consider using Milvus2DocumentStore or WeaviateDocumentStore for scaling to large number of documents
params:
host: localhost
port: 9200
index: msmarco_query_encoder
embedding_dim: 768
- name: Retriever
type: DensePassageRetriever
params:
document_store: DocumentStore # params can reference other components defined in the YAML
top_k: 10
query_embedding_model: rocketqav2-en-marco-query-encoder
passage_embedding_model: rocketqav2-en-marco-query-encoder
embed_title: False
- name: Ranker # custom-name for the component; helpful for visualization & debugging
type: ErnieRanker # pipelines Class name for the component
params:
model_name_or_path: rocketqav2-en-marco-cross-encoder
top_k: 3
use_en: True
reinitialize: True
- name: TextFileConverter
type: TextConverter
- name: ImageFileConverter
type: ImageToTextConverter
- name: PDFFileConverter
type: PDFToTextConverter
- name: DocxFileConverter
type: DocxToTextConverter
- name: Preprocessor
type: PreProcessor
params:
split_by: word
split_length: 1000
- name: FileTypeClassifier
type: FileTypeClassifier

pipelines:
- name: query
type: Query
nodes:
- name: Retriever
inputs: [Query]
- name: Ranker
inputs: [Retriever]
- name: indexing
type: Indexing
nodes:
- name: FileTypeClassifier
inputs: [File]
- name: TextFileConverter
inputs: [FileTypeClassifier.output_1]
- name: PDFFileConverter
inputs: [FileTypeClassifier.output_2]
- name: DocxFileConverter
inputs: [FileTypeClassifier.output_4]
- name: ImageFileConverter
inputs: [FileTypeClassifier.output_6]
- name: Preprocessor
inputs: [PDFFileConverter, TextFileConverter, DocxFileConverter, ImageFileConverter]
- name: Retriever
inputs: [Preprocessor]
- name: DocumentStore
inputs: [Retriever]
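End to end, the new pipeline can be loaded straight from this YAML. A minimal sketch, assuming the `Pipeline.load_from_yaml` API the pipelines package uses elsewhere:

```python
# Sketch: load and run the English semantic search pipeline from its YAML.
from pipelines import Pipeline

pipe = Pipeline.load_from_yaml("pipelines/rest_api/pipeline/semantic_search_en.yaml",
                               pipeline_name="query")
result = pipe.run(query="The introduction of United States of America?",
                  params={"Retriever": {"top_k": 10}, "Ranker": {"top_k": 3}})
```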
3 changes: 3 additions & 0 deletions pipelines/ui/country_search.csv
@@ -0,0 +1,3 @@
"Question Text";"Answer"
"What is the capital of America?";"Washington"
"How many states of the United States ?";"50"