diff --git "a/content/Large Language Models with Semantic Search/5.\351\207\215\346\216\222 Rerank.ipynb" "b/content/Large Language Models with Semantic Search/5.\351\207\215\346\216\222 Rerank.ipynb" index bb2734a4..3a96f73d 100644 --- "a/content/Large Language Models with Semantic Search/5.\351\207\215\346\216\222 Rerank.ipynb" +++ "b/content/Large Language Models with Semantic Search/5.\351\207\215\346\216\222 Rerank.ipynb" @@ -1 +1 @@ -{"cells": [{"cell_type": "markdown", "id": "0ead654a", "metadata": {}, "source": ["# \u7b2c\u4e94\u7ae0\u3001\u91cd\u6392\n", "\n", " - [\u4e00\u3001\u914d\u7f6e\u73af\u5883](#\u4e00\u3001\u914d\u7f6e\u73af\u5883)\n", " - [\u4e8c\u3001\u7a20\u5bc6\u68c0\u7d22](#\u4e8c\u3001\u7a20\u5bc6\u68c0\u7d22)\n", " - [2.1 \u7a20\u5bc6\u68c0\u7d22\u7684\u4e0d\u8db3](#2.1-\u7a20\u5bc6\u68c0\u7d22\u7684\u4e0d\u8db3)\n", " - [2.2 \u91cd\u6392\u6a21\u578b\u7684\u8bad\u7ec3\u65b9\u5f0f](#2.2-\u91cd\u6392\u6a21\u578b\u7684\u8bad\u7ec3\u65b9\u5f0f)\n", " - [\u4e09\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u5173\u952e\u8bcd\u68c0\u7d22](#\u4e09\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u5173\u952e\u8bcd\u68c0\u7d22)\n", " - [\u56db\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u7a20\u5bc6\u68c0\u7d22](#\u56db\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u7a20\u5bc6\u68c0\u7d22)\n", " - [4.1 \u8fdb\u4e00\u6b65\u7406\u89e3\u91cd\u6392](#4.1-\u8fdb\u4e00\u6b65\u7406\u89e3\u91cd\u6392)\n", " - [4.2 \u641c\u7d22\u7cfb\u7edf\u7684\u8bc4\u4f30](#4.2-\u641c\u7d22\u7cfb\u7edf\u7684\u8bc4\u4f30)\n"]}, {"cell_type": "markdown", "id": "7673e576", "metadata": {}, "source": ["\u91cd\u6392\uff08Rerank\uff09\u662f\u4e00\u79cd\u4f18\u5316\u5173\u952e\u8bcd\u68c0\u7d22\u548c\u7a20\u5bc6\u68c0\u7d22\u7684\u65b9\u6cd5\u3002\u5b83\u662f\u8bed\u4e49\u641c\u7d22\u9664\u4e86\u7a20\u5bc6\u68c0\u7d22\u5916\u7684\u91cd\u8981\u7ec4\u6210\u90e8\u5206\u3002Rerank 
\u662f\u4e00\u79cd\u8ba9\u5927\u578b\u8bed\u8a00\u6a21\u578b\u6309\u7167\u4e0e\u67e5\u8be2\u76f8\u5173\u6027\u5bf9\u641c\u7d22\u7ed3\u679c\u4ece\u9ad8\u5230\u4f4e\u6392\u5e8f\u7684\u65b9\u6cd5\u3002"]}, {"cell_type": "markdown", "id": "99f6a6f7", "metadata": {}, "source": ["## \u4e00\u3001\u914d\u7f6e\u73af\u5883\n", "\n", "\u8ba9\u6211\u4eec\u5148\u51c6\u5907\u597d\u9700\u8981\u7528\u5230\u7684\u4e00\u4e9b Python \u5e93\u548c API\uff1a"]}, {"cell_type": "code", "execution_count": null, "id": "f350cd1b", "metadata": {}, "outputs": [], "source": ["!pip install cohere \n", "!pip install weaviate-client"]}, {"cell_type": "code", "execution_count": 3, "id": "b2febbb9-27dd-4209-838a-99b4f9cdf51b", "metadata": {}, "outputs": [], "source": ["import os\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) # \u8bfb\u53d6\u672c\u5730 .env \u6587\u4ef6"]}, {"cell_type": "code", "execution_count": 4, "id": "dab2ecba-3403-4317-86ef-bd6d92a6cb46", "metadata": {}, "outputs": [], "source": ["import cohere\n", "co = cohere.Client(os.environ['COHERE_API_KEY'])"]}, {"cell_type": "markdown", "id": "3944cd92", "metadata": {}, "source": ["\u521b\u5efa\u8fde\u63a5\u5b58\u50a8\u6240\u6709\u7ef4\u57fa\u767e\u79d1\u6761\u76ee\u6570\u636e\u5e93\u7684\u5ba2\u6237\u7aef\u3002"]}, {"cell_type": "code", "execution_count": 5, "id": "30737b1b-e4c8-4bd0-a04b-c2ce70d28821", "metadata": {}, "outputs": [], "source": ["import weaviate\n", "\n", "# \u8fde\u63a5\u5230\u5305\u542b 10M \u7ef4\u57fa\u767e\u79d1\u7684\u7528\u4e8e\u7f51\u7edc\u6f14\u793a\u7684\u5411\u91cf\u6570\u636e\u5e93\n", "# \u4f7f\u7528\u4e00\u4e2a\u516c\u5171\u7684\u62e5\u6709\u53ea\u8bfb\u6743\u9650\u7684API\u952e\n", "auth_config = weaviate.auth.AuthApiKey(\n", " api_key=os.environ['WEAVIATE_API_KEY']) # \"76320a90-53d8-42bc-b41d-678647c6672e\""]}, {"cell_type": "code", "execution_count": 6, "id": "8781f638-17c7-4ab7-86b5-3763d4d5abad", "metadata": {}, "outputs": [{"name": "stderr", "output_type": 
"stream", "text": ["/Users/zhihu123/Library/Python/3.9/lib/python/site-packages/weaviate/warnings.py:158: DeprecationWarning: Dep016: You are using the Weaviate v3 client, which is deprecated.\n", " Consider upgrading to the new and improved v4 client instead!\n", " See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python\n", " \n", " warnings.warn(\n"]}], "source": ["client = weaviate.Client(\n", " url=os.environ['WEAVIATE_API_URL'],\n", " auth_client_secret=auth_config,\n", " additional_headers={\n", " \"X-Cohere-Api-Key\": os.environ['COHERE_API_KEY'],\n", " }\n", ")"]}, {"cell_type": "markdown", "id": "ffcc8e5e", "metadata": {}, "source": ["## \u4e8c\u3001\u7a20\u5bc6\u68c0\u7d22"]}, {"cell_type": "markdown", "id": "2f678341", "metadata": {}, "source": ["### 2.1 \u7a20\u5bc6\u68c0\u7d22\u7684\u4e0d\u8db3"]}, {"cell_type": "markdown", "id": "9d6445d5", "metadata": {}, "source": ["\u9996\u5148\u6211\u4eec\u8c03\u7528\u4e0a\u8282\u8bfe\u7684 `dense_retrieval` \u51fd\u6570\uff0c\u67e5\u770b\u7a20\u5bc6\u68c0\u7d22\u7684\u7ed3\u679c"]}, {"cell_type": "code", "execution_count": 7, "id": "b8561fbf-035e-4856-a97f-8eda21d32a81", "metadata": {}, "outputs": [], "source": ["from utils import dense_retrieval"]}, {"cell_type": "code", "execution_count": 8, "id": "1822cc6c-ddc2-4938-b746-7cda2506d51e", "metadata": {}, "outputs": [], "source": ["from utils import print_result"]}, {"cell_type": "code", "execution_count": 9, "id": "a0b0830e", "metadata": {}, "outputs": [], "source": ["query_1 = \"What is the capital of Canada?\""]}, {"cell_type": "code", "execution_count": 10, "id": "09ba30f0", "metadata": {}, "outputs": [], "source": ["dense_retrieval_results = dense_retrieval(query_1, client)"]}, {"cell_type": "code", "execution_count": 11, "id": "2990c5c4-1b63-453e-8dd8-8568cb7872f5", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["item 0\n", "_additional:{'distance': -150.8031}\n", "\n", "lang:en\n", "\n", 
"text:The governor general of the province had designated Kingston as the capital in 1841. However, the major population centres of Toronto and Montreal, as well as the former capital of Lower Canada, Quebec City, all had legislators dissatisfied with Kingston. Anglophone merchants in Quebec were the main group supportive of the Kingston arrangement. In 1842, a vote rejected Kingston as the capital, and study of potential candidates included the then-named Bytown, but that option proved less popular than Toronto or Montreal. In 1843, a report of the Executive Council recommended Montreal as the capital as a more fortifiable location and commercial centre, however, the Governor General refused to execute a move without a parliamentary vote. In 1844, the Queen's acceptance of a parliamentary vote moved the capital to Montreal.\n", "\n", "title:Ottawa\n", "\n", "url:https://en.wikipedia.org/wiki?curid=22219\n", "\n", "views:2000\n", "\n", "\n", "item 1\n", "_additional:{'distance': -150.28354}\n", "\n", "lang:en\n", "\n", "text:For brief periods, Toronto was twice the capital of the united Province of Canada: first from 1849 to 1852, following unrest in Montreal, and later 1856\u20131858. After this date, Quebec was designated as the capital until 1866 (one year before Canadian Confederation). Since then, the capital of Canada has remained Ottawa, Ontario.\n", "\n", "title:Toronto\n", "\n", "url:https://en.wikipedia.org/wiki?curid=64646\n", "\n", "views:3000\n", "\n", "\n", "item 2\n", "_additional:{'distance': -150.02524}\n", "\n", "lang:en\n", "\n", "text:Selection of Ottawa as the capital of Canada predates the Confederation of Canada. 
The selection was contentious and not straightforward, with the parliament of the United Province of Canada holding more than 200 votes over several decades to attempt to settle on a legislative solution to the location of the capital.\n", "\n", "title:Ottawa\n", "\n", "url:https://en.wikipedia.org/wiki?curid=22219\n", "\n", "views:2000\n", "\n", "\n", "item 3\n", "_additional:{'distance': -149.92365}\n", "\n", "lang:en\n", "\n", "text:Until the late 18th century Qu\u00e9bec was the most populous city in present-day Canada. As of the census of 1790, Montreal surpassed it with 18,000 inhabitants, but Quebec (pop. 14,000) remained the administrative capital of New France. It was then made the capital of Lower Canada by the Constitutional Act of 1791. From 1841 to 1867, the capital of the Province of Canada rotated between Kingston, Montreal, Toronto, Ottawa and Quebec City (from 1852 to 1856 and from 1859 to 1866).\n", "\n", "title:Quebec City\n", "\n", "url:https://en.wikipedia.org/wiki?curid=100727\n", "\n", "views:2000\n", "\n", "\n", "item 4\n", "_additional:{'distance': -149.71033}\n", "\n", "lang:en\n", "\n", "text:The Quebec Conference on Canadian Confederation was held in the city in 1864. 
In 1867, Queen Victoria chose Ottawa as the definite capital of the Dominion of Canada, while Quebec City was confirmed as the capital of the newly created province of Quebec.\n", "\n", "title:Quebec City\n", "\n", "url:https://en.wikipedia.org/wiki?curid=100727\n", "\n", "views:2000\n", "\n", "\n"]}], "source": ["print_result(dense_retrieval_results)"]}, {"cell_type": "markdown", "id": "385d45ad", "metadata": {}, "source": ["\u6ce8\uff1a\u7ecf\u8fc7\u6d4b\u8bd5\uff0c\u53d1\u73b0\u5f53\u524d\u6570\u636e\u5e93\u4e2d\u6587\u8bed\u6599\u53ef\u80fd\u8f83\u5c11\uff0c\u5bf9\u4e2d\u6587\u68c0\u7d22\u6bd4\u8f83\u7b80\u5355\uff0c\u6240\u4ee5\u5bf9\u67e5\u8be2\uff08query\uff09\u8fdb\u884c\u4e86\u7b80\u5316\u3002\uff08\u4f8b\u5982\u53ea\u4fdd\u7559\u5173\u952e\u8bcd\uff0c\u7c7b\u4f3c\u4e3b\u8bed\uff09"]}, {"cell_type": "code", "execution_count": 134, "id": "15694a5c-3525-49cc-b5e9-d1c34ae0fbe9", "metadata": {}, "outputs": [], "source": ["query_1 = \"\u52a0\u62ff\u5927\u9996\u90fd\""]}, {"cell_type": "code", "execution_count": 135, "id": "6dfede25-8a43-41c9-9328-d331695c4fcb", "metadata": {}, "outputs": [], "source": ["dense_retrieval_results = dense_retrieval(query_1, client, 'zh')"]}, {"cell_type": "code", "execution_count": 136, "id": "1bb57590", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["item 0\n", "_additional:{'distance': -152.25616}\n", "\n", "lang:zh\n", "\n", 
"text:18\u4e16\u7eaa\u665a\u671f\u4e4b\u524d\uff0c\u9b41\u5317\u514b\u57ce\u4e00\u76f4\u662f\u52a0\u62ff\u5927\u4eba\u53e3\u6700\u591a\u7684\u57ce\u5e02\u3002\u57281790\u5e74\u7684\u666e\u67e5\u671f\u95f4\uff0c\u8499\u7279\u5229\u5c14\u4ee518,000\u5c45\u6c11\u8d85\u8fc7\u4e86\u9b41\u5317\u514b\uff0c\u4f46\u9b41\u5317\u514b\uff0814,000\u4eba\u53e3\uff09\u4f9d\u7136\u4fdd\u4f4f\u4e86\u65b0\u6cd5\u5170\u897f\u884c\u653f\u9996\u5e9c\u7684\u5730\u4f4d\u3002\u57281791\u5e74\u5baa\u6cd5\u4e2d\uff0c\u9b41\u5317\u514b\u57ce\u6210\u4e3a\u4e0b\u52a0\u62ff\u5927\u7684\u9996\u5e9c\u3002\u4ece1841\u5e74\u52301867\u5e74\uff0c\u52a0\u62ff\u5927\u7701\u7684\u9996\u5e9c\u5728\u51e0\u4e2a\u57ce\u5e02\u4e4b\u95f4\u8f6e\u66ff\uff0c\u5305\u62ec\u91d1\u58eb\u987f\uff0c\u8499\u7279\u5229\u5c14\uff0c\u591a\u4f26\u591a\uff0c\u6e25\u592a\u534e\u548c\u9b41\u5317\u514b\uff081852\u5e74\u52301856\u5e74\uff0c1859\u5e74\u52301866\u5e74\uff09\u3002\n", "\n", "title:\u9b41\u5317\u514b\u5e02\n", "\n", "url:https://zh.wikipedia.org/wiki?curid=117192\n", "\n", "views:600\n", "\n", "\n", "item 1\n", "_additional:{'distance': -150.94444}\n", "\n", "lang:zh\n", "\n", 
"text:\u6e25\u592a\u83ef\uff08\uff09\u662f\u52a0\u62ff\u5927\u7684\u806f\u90a6\u9996\u90fd\uff0c\u5168\u570b\u7b2c\u56db\u5927\u57ce\u5e02\uff0c\u5e02\u5340\u4eba\u53e3\u662f934,243\u4eba\uff0c\u9996\u90fd\u5708\u5730\u5340\u662f1,323,783\u4eba\uff08\u6839\u64da2016\u5e74\u4eba\u53e3\u666e\u67e5\uff09\uff0c\u9762\u7a4d2,779\u5e73\u65b9\u516c\u91cc\uff0c\u4f4d\u65bc\u5b89\u5927\u7565\u7701\u6771\u5357\u90e8\uff0c\u6e25\u592a\u83ef\u6cb3\u5357\u5cb8\uff0c\u591a\u502b\u591a\u4ee5\u6771400\u516c\u91cc\uff0c\u8499\u7279\u5229\u723e\u4ee5\u897f190\u516c\u91cc\u3002\u8207\u7f8e\u570b\u3001\u6fb3\u5927\u5229\u4e9e\u7b49\u570b\u4e0d\u540c\uff0c\u6e25\u592a\u83ef\u4e0d\u662f\u806f\u90a6\u76f4\u8f44\u7684\u884c\u653f\u5340\uff0c\u4f46\u662f\u6e25\u592a\u83ef\u571f\u5730\u7ba1\u7406\u548c\u57ce\u5e02\u898f\u5283\u662f\u7531\u570b\u5bb6\u9996\u90fd\u59d4\u54e1\u6703\uff08National Capital Commission\uff09\u8ca0\u8cac\u3002\n", "\n", "title:\u6e25\u592a\u83ef\n", "\n", "url:https://zh.wikipedia.org/wiki?curid=70236\n", "\n", "views:800\n", "\n", "\n", "item 2\n", "_additional:{'distance': -150.90271}\n", "\n", "lang:zh\n", "\n", 
"text:1857\u5e7412\u670831\u65e5\uff0c\u7ef4\u591a\u5229\u4e9a\u5973\u738b\u9009\u62e9\u6e25\u592a\u534e\u4e3a\u52a0\u62ff\u5927\u7701\u7684\u9996\u90fd\uff08\u5305\u62ec\u73b0\u5728\u7684\u5b89\u5927\u7565\u548c\u9b41\u5317\u514b\uff09\u3002\u867d\u7136\u73b0\u4ee3\u7684\u6e25\u592a\u534e\u662f\u52a0\u62ff\u5927\u7b2c\u56db\u5927\u57ce\u5e02\uff0c\u4f46\u5728\u5f53\u5e74\uff0c\u5979\u4ec5\u4ec5\u662f\u4e00\u4e2a\u6728\u6750\u8d38\u6613\u901a\u9053\u4e2d\u7684\u5185\u9646\u5c0f\u9547\uff0c\u5e76\u4e14\u8ddd\u79bb\u6b96\u6c11\u5730\u7684\u51e0\u4e2a\u4e3b\u8981\u57ce\u5e02\uff08\u4e1c\u90e8\u7684\u8499\u7279\u5229\u5c14\u548c\u9b41\u5317\u514b\u57ce\uff1b\u897f\u90e8\u7684\u591a\u4f26\u591a\u548c\u4eac\u58eb\u9813\uff09\u8def\u9014\u9065\u8fdc\u3002\u5973\u738b\u7684\u987e\u95ee\u4eec\u5efa\u8bae\u6e25\u592a\u534e\u6210\u4e3a\u9996\u90fd\u4e4b\u9009\u6709\u4e24\u5927\u91cd\u8981\u7406\u7531\uff1a\u9996\u5148\uff0c\u6e25\u592a\u534e\u662f\u552f\u4e00\u5177\u6709\u4e00\u5b9a\u89c4\u6a21\u3001\u5e76\u4e14\u4f4d\u4e8e\u52a0\u62ff\u5927\u7701\u4e1c\u897f\u90e8\u8fb9\u754c\u5730\uff08\u73b0\u5b89\u5927\u7565\u4e0e\u9b41\u5317\u514b\u8fb9\u754c\uff09\u7684\u57ce\u5e02\uff0c\u5b9a\u90fd\u4e8e\u6b64\u662f\u5e73\u8861\u4e24\u4e2a\u6b96\u6c11\u5730\u53ca\u5176\u82f1\u88d4\u3001\u6cd5\u88d4\u5c45\u6c11\u7684\u806a\u660e\u59a5\u534f\u4e4b\u4e3e\uff1b\u5176\u6b21\uff0c1812\u5e74\u6218\u4e89\u8868\u660e\uff0c\u5176\u4ed6\u4e3b\u8981\u57ce\u5e02\u5bb9\u6613\u53d7\u5230\u7f8e\u56fd\u4eba\u7684\u653b\u51fb\uff0c\u56e0\u4e3a\u8fc7\u4e8e\u9760\u8fd1\u7f8e\u52a0\u8fb9\u754c\u3002\u6e25\u592a\u534e\u4f4d\u4e8e\u8179\u5730\uff0c\u6613\u4e8e\u9632\u5b88\uff0c\u6e25\u592a\u534e\u6cb3\u53ca\u4e3d\u90fd\u8fd0\u6cb3\u4f7f\u4e4b\u4e0e\u52a0\u62ff\u5927\u4e1c\u897f\u90e8\u4e4b\u95f4\u4ea4\u901a\u6781\u4e3a\u4fbf\u5229\u3002\u53e6\u5916\u4e24\u4e2a\u65b9\u9762\u7684\u8003\u8651\u662f\uff1a\u6e25\u592a\u534e\u6b63\u597d\u4ecb\u4e8e\u591a\u4f26\u591a\u548c\u9b41\u5317\u514b\u57ce\u4e4b\u95f4\uff08\u
8ddd\u79bb\u8fd9\u4e24\u4e2a\u57ce\u5e02\u90fd\u662f500\u516c\u91cc\uff09\uff0c\u5e76\u4e14\u57ce\u5e02\u89c4\u6a21\u8f83\u5c0f\uff0c\u56e0\u800c\u4e0d\u5bb9\u6613\u53d7\u5230\u5927\u89c4\u6a21\u7684\u66b4\u5f92\u88ad\u51fb\uff0c\u56e0\u4e3a\u653f\u6cbb\u52a8\u673a\uff0c\u4ee5\u5f80\u7684\u9996\u90fd\u57ce\u5e02\u90fd\u53d7\u5230\u8fc7\u8fd9\u79cd\u653b\u51fb\u3002\n", "\n", "title:\u6e25\u592a\u83ef\n", "\n", "url:https://zh.wikipedia.org/wiki?curid=70236\n", "\n", "views:800\n", "\n", "\n", "item 3\n", "_additional:{'distance': -150.68478}\n", "\n", "lang:zh\n", "\n", "text:\u591a\u4f26\u591a\uff08\uff0c\uff09\uff0c\u662f\u5317\u7f8e\u6d32\u56fd\u5bb6\u52a0\u62ff\u5927\u5b89\u5927\u7565\u7701\u9996\u5e9c\uff0c\u52a0\u62ff\u5927\u7684\u6700\u5927\u57ce\u5e02\u3002\u591a\u4f26\u591a\u5750\u843d\u5728\u5b89\u5927\u7565\u6e56\u897f\u5317\u5cb8\u7684\u5357\u5b89\u5927\u7565\u5730\u533a\u3002\u6839\u636e2021\u5e74\u7684\u52a0\u62ff\u5927\u4eba\u53e3\u666e\u67e5\uff0c\u591a\u4f26\u591a\u5e02\u4eba\u53e3\u8fbe2,794,356\u4eba\uff0c\u4e3a\u52a0\u62ff\u5927\u6700\u5927\u57ce\u5e02\u3002\u591a\u4f26\u591a\u5e02\u662f\u5927\u591a\u4f26\u591a\u5730\u533a\u7684\u5fc3\u810f\u5730\u533a\uff0c\u4e5f\u662f\u5b89\u5927\u7565\u7701\u5357\u90e8\u4eba\u53e3\u7a20\u5bc6\u533a\uff08\u79f0\u4f5c\u201c\u91d1\u99ac\u8e44\u5730\u5340\u201d\uff09\u7684\u4e00\u90e8\u5206\u3002\u90fd\u6703\u533a\u67096,202,225\u540d\u5c45\u6c11\uff0c\u800c\u8986\u84cb\u7bc4\u570d\u8f03\u5ee3\u7684\u5927\u591a\u502b\u591a\u5730\u5340\u5247\u67099,765,188\u540d\u5c45\u6c11\u3002\u4f5c\u70ba\u52a0\u62ff\u5927\u7684\u7ecf\u6d4e\u4e2d\u5fc3\uff0c\u591a\u4f26\u591a\u662f\u4e00\u500b\u4e16\u754c\u7ea7\u57ce\u5e02\uff0c\u4e5f\u662f\u4e16\u754c\u4e0a\u6700\u5927\u7684\u91d1\u878d\u4e2d\u5fc3\u4e4b\u4e00\u3002\u591a\u4f26\u591a\u5728\u7ecf\u6d4e\u4e0a\u7684\u9886\u5148\u5730\u4f4d\u5728\u4e8e\u91d1\u878d\u3001\u5546\u4e1a\u670d\u52a1\u3001\u7535\u4fe1\u3001\u822a\u592a\u3001\u4ea4\u901a\u8fd0\u8f93\u3001\u5a92\u4f53\u3001
\u827a\u672f\u3001\u7535\u5f71\u3001\u7535\u89c6\u88fd\u4f5c\u3001\u51fa\u7248\u3001\u8f6f\u4ef6\u3001\u533b\u836f\u7814\u7a76\u3001\u6559\u80b2\u3001\u65c5\u6e38\u3001\u4f53\u80b2\u7b49\u4ea7\u4e1a\u3002\u591a\u4f26\u591a\u8bc1\u5238\u4ea4\u6613\u6240\u662f\u4e16\u754c\u7b2c\u4e03\u5927\u4ea4\u6613\u6240\uff0c\u603b\u90e8\u8bbe\u4e8e\u5e02\u5185\uff0c\u6709\u591a\u6570\u52a0\u62ff\u5927\u516c\u53f8\u5728\u8fd9\u91cc\u4e0a\u5e02\u3002\n", "\n", "title:\u591a\u4f26\u591a\n", "\n", "url:https://zh.wikipedia.org/wiki?curid=3132\n", "\n", "views:1000\n", "\n", "\n", "item 4\n", "_additional:{'distance': -150.47894}\n", "\n", "lang:zh\n", "\n", "text:\u8499\u7279\u5a41\u66fe\u7ecf\u662f\u52a0\u62ff\u5927\u7ecf\u6d4e\u9996\u90fd\uff0c\u62e5\u6709\u6700\u591a\u7684\u4eba\u53e3\u53ca\u6700\u53d1\u8fbe\u7684\u7ecf\u6d4e\uff0c\u4f46\u662f\u57281976\u5e74\u8499\u7279\u5a41\u5967\u904b\u6703\u540e\u88ab\u5b89\u5927\u7565\u7701\u7684\u591a\u4f26\u591a\u8d85\u8fc7\u3002\u4eca\u5929\u8499\u7279\u5229\u5c14\u4ecd\u7136\u662f\u52a0\u62ff\u5927\u6700\u91cd\u8981\u7684\u7ecf\u6d4e\u4e2d\u5fc3\u4e4b\u4e00\uff0c\u4eba\u5de5\u667a\u6167\u3001\u822a\u7a7a\u5de5\u4e1a\u3001\u91d1\u878d\u3001\u8bbe\u8ba1\u3001\u7535\u5f71\u5de5\u4e1a\u7b49\u884c\u4e1a\u53d1\u8fbe\u3002\u8499\u7279\u5a41\u88ab\u8ba4\u4e3a\u662f\u4e16\u754c\u6700\u4f73\u5b9c\u5c45\u57ce\u5e02\uff0c\u5e76\u88ab\u8054\u5408\u56fd\u6559\u80b2\u3001\u79d1\u5b66\u53ca\u6587\u5316\u7ec4\u7ec7\u8ba4\u5b9a\u4e3a\u8bbe\u8ba1\u4e4b\u57ce\u30021999\u5e74\u7b2c35\u5c46\u570b\u969b\u6280\u80fd\u7af6\u8cfd\u5728\u9019\u88e1\u8209\u884c\u3002\n", "\n", "title:\u8499\u7279\u5229\u5c14\n", "\n", "url:https://zh.wikipedia.org/wiki?curid=43791\n", "\n", "views:1000\n", "\n", "\n"]}], "source": ["print_result(dense_retrieval_results)"]}, {"cell_type": "markdown", "id": "88200092", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u67e5\u770b\u68c0\u7d22\u7ed3\u679c\uff1a\n", "\n", 
"\u7ed3\u679c\u4e2d\u7b2c\u4e8c\u4e2a\u662f\u6b63\u786e\u7684\uff0c\u662f\u6e25\u592a\u534e\u3002\u6709\u4e00\u4e9b\u4e0d\u518d\u662f\u6b63\u786e\u7b54\u6848\u7684\u7ed3\u679c\u3002\u591a\u4f26\u591a\u4e0d\u662f\u52a0\u62ff\u5927\u7684\u9996\u90fd\u3002\u7136\u540e\uff0c\u6211\u4eec\u8fd8\u6709\u9b41\u5317\u514b\u5e02\uff0c\u8fd9\u662f\u9519\u8bef\u7684\u7b54\u6848\u3002\u4e3a\u4ec0\u4e48\u4f1a\u53d1\u751f\u8fd9\u79cd\u60c5\u51b5\u5462\uff1f\n", "\n", "\u901a\u8fc7\u4e00\u4e2a\u5c0f\u4f8b\u5b50\u6765\u5e2e\u52a9\u7406\u89e3\u8fd9\u4e2a\u6982\u5ff5\u3002\u867d\u7136\u548c\u5f53\u524d\u7684\u641c\u7d22\u7ed3\u679c\u6709\u70b9\u4e0d\u540c\uff0c\u4f46\u6709\u52a9\u4e8e\u6211\u4eec\u7406\u89e3\u8fd9\u4e2a\u60c5\u51b5\u3002\n", "\n", "\u5047\u8bbe\u67e5\u8be2\u7684\u95ee\u9898\u662f\u201c\u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u4ec0\u4e48\uff1f\u201d\uff0c\u53ef\u80fd\u7684\u56de\u7b54\u6709\u4ee5\u4e0b\u4e94\u4e2a\uff1a\n", "\n", "- \u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u6e25\u592a\u534e\uff1a\u8fd9\u662f\u6b63\u786e\u7684\u3002\n", "\n", "- \u591a\u4f26\u591a\u4f4d\u4e8e\u52a0\u62ff\u5927\uff1a\u8fd9\u4e5f\u662f\u6b63\u786e\u7684\uff0c\u4f46\u4e0e\u95ee\u9898\u65e0\u5173\u3002\n", "\n", "- \u6cd5\u56fd\u7684\u9996\u90fd\u662f\u5df4\u9ece\uff1a\u8fd9\u4e5f\u662f\u6b63\u786e\u7684\uff0c\u4f46\u4e0d\u662f\u95ee\u9898\u7684\u7b54\u6848\u3002\n", "\n", "- \u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u6089\u5c3c\uff1a\u8fd9\u662f\u4e0d\u6b63\u786e\u7684\u3002\n", "\n", "- \u5b89\u5927\u7565\u7684\u7701\u4f1a\u662f\u591a\u4f26\u591a\uff1a\u8fd9\u662f\u6b63\u786e\u7684\uff0c\u4f46\u540c\u6837\u672a\u80fd\u56de\u7b54\u95ee\u9898\u3002\n", "\n", "\n", "![Dense Retrieval is also not perfect](images/5-1.png)"]}, {"cell_type": "markdown", "id": "4200b1f2", "metadata": {}, "source": ["\u8fdb\u884c\u7a20\u5bc6\u68c0\u7d22\u65f6\u4f1a\u53d1\u751f\u4ec0\u4e48\u5462\uff1f\n", "\n", "\u6211\u4eec\u5047\u8bbe\u4e94\u4e2a\u53e5\u5b50\u5728 embedding 
\u7a7a\u95f4\u7684\u5206\u5e03\u5982\u56fe\u6240\u793a\u3002\u7a20\u5bc6\u68c0\u7d22\u7684\u539f\u7406\u662f\u4e3a\u67e5\u8be2\u751f\u6210 embedding\uff0c\u7136\u540e\u8fd4\u56de\u4e0e\u4e4b\u6700\u63a5\u8fd1\u7684\u5185\u5bb9\uff0c\u5373\u201c\u5b89\u5927\u7565\u7684\u7701\u4f1a\u662f\u591a\u4f26\u591a\u201d\u3002\u7a20\u5bc6\u68c0\u7d22\u770b\u91cd**\u8bed\u4e49\u76f8\u4f3c\u6027**\uff0c\u56e0\u6b64\u5b83\u8fd4\u56de\u4e0e\u95ee\u9898\u6700\u76f8\u4f3c\u7684\u5185\u5bb9\u3002\u4f46\u8fd9\u53ef\u80fd\u4e0d\u662f\u6b63\u786e\u7684\u7b54\u6848\uff0c\u751a\u81f3\u53ef\u80fd\u4e0d\u662f\u771f\u5b9e\u7684\u9648\u8ff0\uff0c\u5b83\u53ea\u662f\u4e00\u4e2a\u5728\u8bed\u4e49\u4e0a\u4e0e\u95ee\u9898\u63a5\u8fd1\u7684\u53e5\u5b50\u3002\u56e0\u6b64\uff0c\u7a20\u5bc6\u68c0\u7d22\u6709\u53ef\u80fd\u8fd4\u56de\u7684\u5e76\u975e\u7b54\u6848\u3002\u6211\u4eec\u5982\u4f55\u4fee\u590d\u8fd9\u4e2a\u95ee\u9898\u5462\uff1f\u8fd9\u5c31\u662f\u91cd\u6392\u8d77\u4f5c\u7528\u7684\u5730\u65b9\u3002"]}, {"cell_type": "markdown", "id": "acdc556d", "metadata": {}, "source": ["![rerank](images/5-2.png)"]}, {"cell_type": "markdown", "id": "ff2ff78a", "metadata": {}, "source": 
["\u5047\u8bbe\u67e5\u8be2\u662f\u201c\u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u4ec0\u4e48\u201d\uff0c\u6b64\u65f6\u670910\u4e2a\u53ef\u80fd\u7684\u7b54\u6848\uff0c\u5176\u4e2d\u4e00\u4e9b\u4e0e\u95ee\u9898\u76f8\u5173\uff0c\u800c\u53e6\u4e00\u4e9b\u5219\u4e0d\u76f8\u5173\u3002\u56e0\u6b64\uff0c\u5f53\u6211\u4eec\u4f7f\u7528\u7a20\u5bc6\u68c0\u7d22\u65f6\uff0c\u5b83\u4f1a\u7ed9\u6211\u4eec\u4e0e\u67e5\u8be2\u6700\u76f8\u4f3c\u7684\u4e94\u4e2a\u7b54\u6848\uff0c\u4e5f\u5c31\u662f\u4e0e\u67e5\u8be2\u6700\u76f8\u4f3c\u7684\u4e94\u4e2a\u5185\u5bb9\u3002\u5047\u8bbe\u8fd4\u56de\u5185\u5bb9\u5c31\u662f\u7eff\u8272\u7684\u8fd9\u4e9b\u53e5\u5b50\u3002\u73b0\u5728\u6211\u4eec\u6709\u4e94\u4e2a\u4e0e\u67e5\u8be2\u975e\u5e38\u63a5\u8fd1\u7684\u53e5\u5b50\uff0c\u4f46\u6211\u4eec\u4e0d\u77e5\u9053\u54ea\u4e00\u4e2a\u624d\u662f\u6b63\u786e\u7b54\u6848\u3002\u8fd9\u5c31\u662f Rerank \u53d1\u6325\u4f5c\u7528\u7684\u5730\u65b9\u3002\n", "\n", "\u91cd\u6392\u6a21\u578b\u4e3a\u6bcf\u4e2a\u67e5\u8be2\u7ed3\u679c\u5bf9\u6253\u4e00\u4e2a\u76f8\u5173\u5f97\u5206\uff0c\u544a\u8bc9\u60a8\u7b54\u6848\u76f8\u5bf9\u4e8e\u67e5\u8be2\u7684\u76f8\u5173\u7a0b\u5ea6\u3002\u8fd9 5 \u4e2a\u53e5\u5b50\u6700\u9ad8\u76f8\u5173\u6027\u4e3a 0.9\uff0c\u5bf9\u5e94\u4e8e\u201c\u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u6e25\u592a\u534e\u201d\uff0c\u8fd9\u5c31\u662f\u6b63\u786e\u7684\u7b54\u6848\u3002\u8fd9\u5c31\u662f\u91cd\u6392\u7684\u4f5c\u7528\u3002"]}, {"cell_type": "markdown", "id": "f8f5cbaf", "metadata": {}, "source": ["### 2.2 \u91cd\u6392\u6a21\u578b\u7684\u8bad\u7ec3\u65b9\u5f0f"]}, {"cell_type": "markdown", "id": "68c76221", "metadata": {}, "source": ["![rerank_training](images/5-3.png)"]}, {"cell_type": "markdown", "id": "cc62573b", "metadata": {}, "source": 
["\u91cd\u6392\u6a21\u578b\u7684\u8bad\u7ec3\u9700\u8981\u5927\u91cf\u7684\u9ad8\u8d28\u91cf\u6837\u672c\uff0c\u8fd9\u4e9b\u6837\u672c\u5305\u62ec\u4e0e\u67e5\u8be2\u9ad8\u5ea6\u76f8\u5173\u7684\u54cd\u5e94\u6216\u6587\u6863\u3002\u8bad\u7ec3\u7684\u76ee\u6807\u662f\u4f7f\u6a21\u578b\u80fd\u591f\u7ed9\u51fa\u9ad8\u76f8\u5173\u6027\u7684\u5f97\u5206\u3002\u540c\u65f6\uff0c\u6211\u4eec\u4e5f\u9700\u8981\u63d0\u4f9b\u4e00\u4e9b\u9519\u8bef\u7684\u67e5\u8be2\u54cd\u5e94\u4f5c\u4e3a\u6837\u672c\uff0c\u8fd9\u4e9b\u54cd\u5e94\u53ef\u80fd\u4e0e\u67e5\u8be2\u4e0d\u5b8c\u5168\u5339\u914d\uff0c\u53ef\u80fd\u662f\u63a5\u8fd1\u4f46\u4e0d\u7b26\u5408\u7684\u60c5\u51b5\uff0c\u6216\u8005\u662f\u4e00\u4e2a\u53ef\u80fd\u4e0e\u67e5\u8be2\u4e0d\u5339\u914d\u7684\u6587\u6863\u3002\u901a\u8fc7\u8bad\u7ec3\u6a21\u578b\u5bf9\u4f18\u8d28\u7684\u67e5\u8be2\u54cd\u5e94\u7ed9\u51fa\u9ad8\u5206\uff0c\u5bf9\u4e0d\u7406\u60f3\u7684\u67e5\u8be2\u54cd\u5e94\u7ed9\u51fa\u4f4e\u5206\uff0c\u4ee5\u6b64\u83b7\u5f97\u4e00\u4e2a\u80fd\u591f\u5206\u914d\u76f8\u5173\u6027\u7684\u91cd\u6392\u6a21\u578b\u3002\u5f53\u67e5\u8be2\u548c\u54cd\u5e94\u9ad8\u5ea6\u76f8\u5173\u65f6\uff0c\u8be5\u6a21\u578b\u5c06\u7ed9\u51fa\u9ad8\u5f97\u5206\u3002"]}, {"cell_type": "markdown", "id": "db449134", "metadata": {}, "source": ["## \u4e09\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u5173\u952e\u8bcd\u68c0\u7d22"]}, {"cell_type": "markdown", "id": "e15d7130", "metadata": {}, "source": ["\u6211\u4eec\u5c06\u5bfc\u5165\u4e4b\u524d\u5728\u7b2c\u4e00\u8bfe\u4e2d\u4f7f\u7528\u7684\u5173\u952e\u8bcd\u68c0\u7d22\u51fd\u6570\u3002\u518d\u6b21\u95ee\u5b83\uff0c\u201c\u52a0\u62ff\u5927\u7684\u9996\u90fd\u662f\u4ec0\u4e48\u201d"]}, {"cell_type": "code", "execution_count": 12, "id": "8071c68a-6dec-47f9-b5e1-473f9acdc83f", "metadata": {}, "outputs": [], "source": ["from utils import keyword_search"]}, {"cell_type": "code", "execution_count": 27, "id": "e851efa5-10c7-4f98-85f1-2a1c565d9723", "metadata": {}, "outputs": [{"name": "stdout", 
"output_type": "stream", "text": ["i:0\n", "Monarchy of Canada\n", "In his 1990 book, \"Continental Divide: the Values and Institutions of the United States and Canada,\" Seymour Martin Lipset argues that the presence of the monarchy in Canada helps distinguish Canadian identity from American identity. Since at least the 1930s, supporters of the Crown have held the opinion that the Canadian monarch is also one of the rare unified elements of Canadian society, focusing both \"the historic consciousness of the nation\" and various forms of patriotism and national love \"[on] the point around which coheres the nation's sense of a continuing personality\". Former Governor General Vincent Massey articulated in 1967 that the monarchy \"is part of ourselves. It is linked in a very special way with our national life. It stands for qualities and institutions which mean Canada to every one of us and which for all our differences and all our variety have kept Canada Canadian.\" But, according to Arthur Bousfield and Gary Toffoli, Canadians were, through the late 1960s to the 2000s, encouraged by the federal government to \"neglect, ignore, forget, reject, debase, suppress, even hate, and certainly treat as foreign what their parents and grandparents, whether spiritual or blood, regarded as the basis of Canadian nationhood, autonomy, and history\", including the monarchy. Former Governor General Roland Michener said in 1970 that anti-monarchists claimed the Canadian Crown is foreign and incompatible with Canada's multicultural society, which the government promoted as a Canadian identifier, and Lawrence Martin called in 2007 for Canada to become a republic in order to \"re-brand the nation\". 
However, Michener also stated, \"[the monarchy] is our own by inheritance and choice, and contributes much to our distinctive Canadian identity and our chances of independent survival amongst the republics of North and South America.\" Journalist Christina Blizzard emphasized in 2009 that the monarchy \"made [Canada] a haven of peace and justice for immigrants from around the world\", while Michael Valpy contended in 2009 that the Crown's nature permitted non-conformity amongst its subjects, thereby opening the door to multiculturalism and pluralism.\n", "i:1\n", "Early modern period\n", "North America outside the zone of Spanish settlement was a contested area in the 17th century. Spain had founded small settlements in Florida and Georgia but nowhere near the size of those in New Spain or the Caribbean islands. France, The Netherlands, and Great Britain held several colonies in North America and the West Indies from the 17th century, 100 years after the Spanish and Portuguese established permanent colonies. The British colonies in North America were founded between 1607 (Virginia) and 1733 (Georgia). The Dutch explored the east coast of North America and began founding settlements in what they called New Netherland (now New York State.). France colonized what is now Eastern Canada, founding Quebec City in 1608. France's loss in the Seven Years' War resulted in the transfer of New France to Great Britain. The Thirteen Colonies, in lower British North America, rebelled against British rule in 1775, largely due to the taxation that Great Britain was imposing on the colonies. The British colonies in Canada remained loyal to the crown, and a provisional government formed by the Thirteen Colonies proclaimed their independence on July 4, 1776 and subsequently became the original 13 United States of America. 
With the 1783 Treaty of Paris ending the American Revolutionary War, Britain recognised the former Thirteen Colonies' independence.\n", "i:2\n", "Flag of Canada\n", "By the Second World War, the Red Ensign was viewed as Canada's \"de facto\" national flag. A joint committee of the Senate and House of Commons was appointed on November 8, 1945, to recommend a national flag to officially adopt. It received 2,409 designs from the public and was addressed by the director of the Historical Section of the Canadian Army, Fortescue Duguid, who pointed out that red and white were Canada's official colours and there was already an emblem representing the country: three joined maple leaves seen on the escutcheon of the Canadian coat of arms. By May 9 the following year, the committee reported back with a recommendation \"that the national flag of Canada should be the Canadian red ensign with a maple leaf in autumn golden colours in a bordered background of white\". The Legislative Assembly of Quebec had urged the committee to not include any of what it deemed as \"foreign symbols\", including the Union Flag, and Mackenzie King, then still prime minister, declined to act on the report; fearing it may lead to political instability. 
As a result, the Union Flag was kept as a national flag, and the order to fly the Canadian Red Ensign at government buildings was maintained.\n"]}], "source": ["query_1 = \"What is the capital of Canada?\"\n", "results = keyword_search(query_1,\n", " client,\n", " properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n", " num_results=3\n", " )\n", "\n", "for i, result in enumerate(results):\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " print(result.get('text'))"]}, {"cell_type": "code", "execution_count": 169, "id": "b1b88822", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "\u9996\u90fd\n", "\u6bcf\u4e00\u500b\u570b\u5bb6\u901a\u5e38\u53ea\u8a2d\u7acb\u4e00\u500b\u9996\u90fd\uff0c\u56e0\u70ba\u653f\u5e9c\u901a\u5e38\u6703\u5c07\u5176\u91cd\u8981\u6a5f\u95dc\u96c6\u4e2d\u5728\u9996\u90fd\u5730\u5340\uff0c\u4ee5\u65b9\u4fbf\u653f\u5e9c\u9ad8\u5c64\u884c\u653f\u548c\u7ba1\u7406\uff0c\u4f46\u4ea6\u6709\u4f8b\u5916\u3002\u4e00\u4e9b\u570b\u5bb6\u6709\u591a\u500b\u9996\u90fd\uff0c\u4e00\u4e9b\u751a\u81f3\u6c92\u6709\u3002\u6709\u6642\u5019\uff0c\u5be6\u969b\u7684\u9996\u90fd\u548c\u6cd5\u5b9a\u7684\u9996\u90fd\u7531\u65bc\u67d0\u4e9b\u539f\u56e0\u4e26\u4e0d\u5728\u540c\u4e00\u500b\u57ce\u5e02\u3002\u8b6c\u5982\uff0c\u4e00\u500b\u7a31\u70ba\u300c\u9996\u90fd\u300d\u7684\u57ce\u5e02\uff0c\u5be6\u969b\u4e0a\u4e26\u975e\u4e2d\u592e\u653f\u5e9c\u6240\u5728\u5730\u3002\u53cd\u4e4b\uff0c\u6240\u8b02\u7684\u6b63\u5f0f\u300c\u9996\u90fd\u300d\u96d6\u7136\u662f\u4e2d\u592e\u653f\u5e9c\u7684\u6240\u5728\u5730\uff0c\u4f46\u53ef\u80fd\u4e0d\u662f\u653f\u6cbb\u6c7a\u7b56\u7684\u5730\u7406\u4e2d\u5fc3\u3002\u6545\u6b64\uff0c\u300c\u884c\u653f\u9996\u90fd\u300d\u4e00\u822c\u88ab\u8a8d\u5b9a\u70ba\u662f\u8a72\u570b\u7684\u300c\u570b\u5bb6\u9996\u90fd\u300d\u3002\n", "i:1\n", "\u5e7b\u60f3\u6230\u8a18\n", 
"\u904a\u6232\u4e2d\u5b58\u5728\u516d\u500b\u5927\u9678\uff0c\u5404\u500b\u5927\u9678\u7684\u5730\u5716\u4e4b\u9593\u6c92\u6709\u7269\u7406\u9023\u63a5\uff0c\u5730\u5716\u8207\u5730\u5716\u4e4b\u9593\u7684\u79fb\u52d5\u65b9\u5f0f\u70ba\u9ede\u9078\u5404\u500b\u5927\u9678\u4e0a\u7a31\u70ba\u300c\u6230\u5834\u300d\u6216\u8005\u300c\u9996\u90fd\u300d\u7684\u64da\u9ede\u3002\u5176\u4e2d\u5404\u500b\u300c\u6230\u5834\u300d\u662f\u53ef\u4ee5\u5ba3\u6230\u7684\u5730\u5716\uff0c\u800c\u300c\u9996\u90fd\u300d\u5247\u4e0d\u80fd\u88ab\u9032\u653b\uff08\u4e5f\u5c31\u662f\u8aaa\u5c31\u7b97\u4efb\u4f55\u4e00\u500b\u570b\u5bb6\u7684\u672c\u571f\u88ab\u4fb5\u4f54\u5b8c\u7562\u8a72\u570b\u4e5f\u4e0d\u6703\u6ec5\u570b\uff09\u3002\n", "i:2\n", "\u9996\u90fd\n", "\u9996\u90fd\uff0c\u4f5c\u70ba\u570b\u5bb6\u653f\u6cbb\u3001\u7d93\u6fdf\u3001\u6587\u5316\u7684\u6703\u805a\u4e26\u4e0d\u662f\u6c38\u6046\u4e0d\u8b8a\u7684\u3002\u5728\u53e4\u4ee3\uff0c\u570b\u5bb6\u4e00\u822c\u63a1\u53d6\u4e2d\u592e\u96c6\u6b0a\u653f\u7b56\uff0c\u5730\u65b9\u52e2\u529b\u6709\u9650\uff1b\u9996\u90fd\u4e00\u65e6\u6dea\u9677\uff0c\u5c31\u610f\u5473\u8457\u671d\u4ee3\u7684\u8986\u4ea1\u3002\u4e2d\u570b\u4e09\u570b\u6642\u4ee3\uff0c\u8700\u6f22\u3001\u5433\u56e0\u5931\u53bb\u5404\u81ea\u7684\u9996\u90fd\u2500\u2500\u6210\u90fd\u548c\u5efa\u696d\uff08\u4eca\u5357\u4eac\uff09\u800c\u4ea1\u570b\u3002\n"]}], "source": ["query_2_zh = \"\u52a0\u62ff\u5927 \u9996\u90fd\"\n", "results_zh = keyword_search(query_2_zh,\n", " client,\n", " results_lang='zh',\n", " properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n", " num_results=3\n", " )\n", "\n", "for i, result in enumerate(results_zh):\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " print(result.get('text'))"]}, {"cell_type": "markdown", "id": "cfcd3ed4", "metadata": {}, "source": ["- 
\u82f1\u6587\u7b54\u6848\u8f93\u51fa\u7684\u524d\u4e09\u4e2a\u7b54\u6848\u5e76\u4e0d\u7406\u60f3\u3002\u5b83\u4eec\u6d89\u53ca\u52a0\u62ff\u5927\u7684\u541b\u4e3b\u5236\u3001\u65e9\u671f\u73b0\u4ee3\u65f6\u671f\u548c\u52a0\u62ff\u5927\u56fd\u65d7\u3002\n", "- \u4e2d\u6587\u7b54\u6848\u8f93\u51fa\u7684\u524d\u4e09\u4e2a\u7b54\u6848\u76f8\u5173\u6027\u66f4\u5dee\u3002\u5b83\u4eec\u53ea\u8003\u8651\u4e86\u9996\u90fd\uff0c\u6ca1\u6709\u5173\u4e8e\u52a0\u62ff\u5927\u7684\u4fe1\u606f\u3002\n", "\n", "\u4e3a\u4ec0\u4e48\u5b83\u4eec\u4f1a\u8fd9\u6837\u5462\uff1f\u56e0\u4e3a\u5173\u952e\u8bcd\u68c0\u7d22\u4ec5\u4ec5\u662f\u5728\u67e5\u627e\u4e0e\u67e5\u8be2\u6709\u8bb8\u591a\u5171\u540c\u5355\u8bcd\u7684\u6587\u6863\uff0c\u4f46\u65e0\u6cd5\u771f\u6b63\u5224\u65ad\u51fa\u662f\u5426\u8fd9\u4e9b\u6587\u6863\u786e\u5b9e\u5728\u56de\u7b54\u95ee\u9898\u3002\u6240\u6709\u8fd9\u4e9b\u6587\u7ae0\u90fd\u5305\u542b\u4e0e\u67e5\u8be2\u6709\u5f88\u591a\u5171\u540c\u5355\u8bcd\uff0c\u4f46\u5b83\u4eec\u5e76\u975e\u7b54\u6848\u3002"]}, {"cell_type": "markdown", "id": "7268065b", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u6269\u5927\u4e0b\u68c0\u7d22\u89c4\u6a21\uff0c\u8981\u6c42\u5b83\u8fd4\u56de 500 \u4e2a\u7ed3\u679c\u3002\u4e3a\u4e86\u4fbf\u4e8e\u89c2\u6d4b\uff0c\u8fd9\u91cc\u4e0d\u6253\u5370\u6587\u672c\uff0c\u53ea\u6253\u5370\u6807\u9898\u3002"]}, {"cell_type": "code", "execution_count": 13, "id": "6e1b2d2c", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "Monarchy of Canada\n", "i:1\n", "Early modern period\n", "i:2\n", "Flag of Canada\n", "i:3\n", "Flag of Canada\n", "i:4\n", "Prime Minister of Canada\n", "i:5\n", "Hamilton, Ontario\n", "i:6\n", "Liberal Party of Canada\n", "i:7\n", "Stephen Harper\n", "i:8\n", "Monarchy of Canada\n", "i:9\n", "Flag of Canada\n"]}], "source": ["query_1 = \"What is the capital of Canada?\"\n", "results = keyword_search(query_1,\n", " client,\n", " properties=[\"text\", \"title\", \"url\", \"views\", 
\"lang\", \"_additional {distance}\"],\n", " num_results=500\n", " )\n", "\n", "for i, result in enumerate(results[:10]): # \u60a8\u53ef\u4ee5\u81ea\u884c\u8c03\u6574\u8f93\u51fa\u7684\u6807\u9898\u6570\u91cf\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " #print(result.get('text'))"]}, {"cell_type": "code", "execution_count": 14, "id": "74dd95e6", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "\u9996\u90fd\n", "i:1\n", "\u5e7b\u60f3\u6230\u8a18\n", "i:2\n", "\u9996\u90fd\n", "i:3\n", "\u9996\u5e9c\n", "i:4\n", "\u9996\u90fd (\u9999\u6e2f)\n", "i:5\n", "\u4e2d\u83ef\u6c11\u570b\u9996\u90fd\n", "i:6\n", "\u897f\u5b89\u5e02\n", "i:7\n", "\u9996\u90fd\n", "i:8\n", "\u9996\u90fd\n", "i:9\n", "\u9996\u90fd\n"]}], "source": ["query_1_zh = \"\u52a0\u62ff\u5927 \u9996\u90fd\"\n", "results_zh = keyword_search(query_1_zh,\n", " client,\n", " results_lang='zh',\n", " properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n", " num_results=500\n", " )\n", "\n", "for i, result in enumerate(results_zh[:10]): # \u60a8\u53ef\u4ee5\u81ea\u884c\u8c03\u6574\u8f93\u51fa\u7684\u6807\u9898\u6570\u91cf\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " #print(result.get('text'))"]}, {"cell_type": "markdown", "id": "faa3ed9e", "metadata": {}, "source": ["\u8fd9\u91cc\u6709\u6253\u5206\u6700\u9ad8\u7684\u524d 500\u4e2a\u7ed3\u679c\u3002\u6211\u4eec\u5982\u4f55\u624d\u80fd\u786e\u5b9a\u8fd9\u4e9b\u7ed3\u679c\u4e2d\u662f\u5426\u5305\u542b\u7b54\u6848\u5462\uff1f\u8fd9\u5c31\u662f\u91cd\u6392\u7684\u4f5c\u7528\u6240\u5728\u3002\u4e0b\u9762\u8fd9\u4e2a\u51fd\u6570\u5bf9\u54cd\u5e94\u8fdb\u884c\u91cd\u6392\uff0c\u5e76\u8f93\u51fa\u6253\u5206\u6700\u9ad8\u7684 10 \u4e2a\u3002"]}, {"cell_type": "code", "execution_count": 15, "id": "b38761f8-32b1-4b44-be97-0884894cf6b3", "metadata": {}, "outputs": [], "source": ["import cohere\n", "def rerank_responses(query, responses, num_responses=10, 
results_lang='en'):\n", " \"\"\"\n", " \u6839\u636e\u7ed9\u5b9a\u7684\u67e5\u8be2\uff0c\u4f7f\u7528\u6307\u5b9a\u7684\u6a21\u578b\u5bf9\u54cd\u5e94\u5217\u8868\u8fdb\u884c\u91cd\u6392\u5e8f\u3002\n", "\n", " Args:\n", " query (str): \u67e5\u8be2\u3002\n", " responses (list): \u54cd\u5e94\u7684\u5217\u8868\u3002\n", " num_responses (int, optional): \u8fd4\u56de\u7684\u54cd\u5e94\u6570\u91cf\uff0c\u9ed8\u8ba4\u4e3a10\u3002\n", " results_lang (str, optional): \u6307\u5b9a\u7684\u8bed\u8a00\u6a21\u578b\u7248\u672c\uff0c\u9ed8\u8ba4\u4e3a\u82f1\u6587\uff08\u5b98\u65b9\u53ea\u63d0\u4f9b\u82f1\u6587\u548c\u591a\u8bed\u8a00\u4e24\u4e2a\u7248\u672c\uff09\u3002\n", "\n", " Returns:\n", " list: \u91cd\u6392\u5e8f\u540e\u7684\u54cd\u5e94\u5217\u8868\u3002\n", " \"\"\"\n", " \n", " model_name = 'rerank-english-v2.0' if results_lang=='en' else 'rerank-multilingual-v2.0'\n", " \n", " reranked_responses = co.rerank(\n", " model=model_name,\n", " query=query,\n", " documents=responses,\n", " top_n=num_responses,\n", " )\n", " return reranked_responses"]}, {"cell_type": "markdown", "id": "2d84b454", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u8ba9\u6211\u4eec\u5bf9\u7b54\u6848\u7684\u6587\u672c\u8fdb\u884c\u91cd\u6392\u3002"]}, {"cell_type": "code", "execution_count": null, "id": "02d3e55c-0a5b-4b3a-9a59-3f7164927dc0", "metadata": {}, "outputs": [], "source": ["texts = [result.get('text') for result in results] # \u53ea\u63d0\u53d6\u7ed3\u679c\u4e2d\u7684\u6587\u672c\n", "reranked_text = rerank_responses(query_1, texts)"]}, {"cell_type": "code", "execution_count": 32, "id": "6b3a380b-cebf-47da-956d-dc62dc53e5a0", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "RerankResult\n", "\n", "i:1\n", "RerankResult\n", "\n", "i:2\n", "RerankResult\n", "\n", "i:3\n", "RerankResult\n", "\n", "i:4\n", "RerankResult\n", "\n", "i:5\n", "RerankResult\n", "\n", "i:6\n", "RerankResult\n", "\n", "i:7\n", "RerankResult\n", "\n", "i:8\n", 
"RerankResult\n", "\n", "i:9\n", "RerankResult\n", "\n"]}], "source": ["for i, rerank_result in enumerate(reranked_text):\n", " print(f\"i:{i}\")\n", " print(f\"{rerank_result}\")\n", " print()"]}, {"cell_type": "code", "execution_count": 172, "id": "9bb71208", "metadata": {}, "outputs": [], "source": ["texts_zh = [result.get('text') for result in results_zh]\n", "reranked_text_zh = rerank_responses_zh(query_1_zh, texts_zh)"]}, {"cell_type": "code", "execution_count": 173, "id": "b483507b", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "RerankResult\n", "\n", "i:1\n", "RerankResult\n", "\n", "i:2\n", "RerankResult\n", "\n", "i:3\n", "RerankResult\n", "\n", "i:4\n", "RerankResult\n", "\n", "i:5\n", "RerankResult\n", "\n", "i:6\n", "RerankResult\n", "\n", "i:7\n", "RerankResult\n", "\n", "i:8\n", "RerankResult\n", "\n", "i:9\n", "RerankResult\n", "\n"]}], "source": ["for i, rerank_result in enumerate(reranked_text_zh):\n", " print(f\"i:{i}\")\n", " print(f\"{rerank_result}\")\n", " print()"]}, {"cell_type": "markdown", "id": "689dd251", "metadata": {}, "source": ["\u5728\u8f93\u5165\u67e5\u8be2\u548c\u7ed3\u679c\u4e4b\u540e\uff0c\u8ba9\u6211\u4eec\u6253\u5370\u51fa\u91cd\u6392\u7684\u524d 10 \u4e2a\u7ed3\u679c\u3002"]}, {"cell_type": "markdown", "id": "bd5e9b15", "metadata": {}, "source": ["\u8bf7\u6ce8\u610f\uff0c\u5176\u4e2d\u83b7\u5f97\u4e86\u6b63\u786e\u7b54\u6848\u3002\u5b83\u786e\u5b9a\u6e25\u592a\u534e\u4f5c\u4e3a\u52a0\u62ff\u5927\u7684\u9996\u90fd\uff0c\u5e76\u4e14\u76f8\u5173\u5206\u6570\u975e\u5e38\u9ad8\uff0c\u63a5\u8fd1 1\uff0c\u8fbe\u5230 0.98\u3002\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0c\u6392\u540d\u7b2c\u4e8c\u7684\u6587\u7ae0\u4e5f\u76f8\u5f53\u4e0d\u9519\uff0c\u5b83\u6d89\u53ca\u52a0\u62ff\u5927\u5386\u53f2\u4e0a\u4e0d\u540c\u7684\u9996\u90fd\uff0c\u5176\u76f8\u5173\u5206\u6570\u4e3a 0.97\u3002\u7b2c\u4e09\u4e2a\u4e5f\u5f88\u51fa\u8272\u3002\u91cd\u6392\u4ece\u5173\u952e\u8bcd\u68c0\u7d22\u51fa\u7684 500 
\u4e2a\u7b54\u6848\u4e2d\u6311\u9009\u51fa\u76f8\u5173\u6027\u6700\u9ad8\u7684 10\u3002"]}, {"cell_type": "markdown", "id": "f6cbb081", "metadata": {}, "source": ["## \u56db\u3001\u4f7f\u7528\u91cd\u6392\u6539\u8fdb\u7a20\u5bc6\u68c0\u7d22"]}, {"cell_type": "markdown", "id": "9e5ae7b5", "metadata": {}, "source": ["### 4.1 \u8fdb\u4e00\u6b65\u7406\u89e3\u91cd\u6392"]}, {"cell_type": "markdown", "id": "9d9c0d16", "metadata": {}, "source": ["\u6211\u5c06\u518d\u6b21\u4f7f\u7528\u7a20\u5bc6\u68c0\u7d22\u51fd\u6570\uff0c\u5c1d\u8bd5\u89e3\u51b3\u4e00\u4e2a\u7a0d\u5fae\u56f0\u96be\u7684\u95ee\u9898\u3002\u6211\u4eec\u8be2\u95ee\uff1a\"\u8c01\u662f\u5386\u53f2\u4e0a\u6700\u9ad8\u7684\u4eba\uff1f\" \u5bf9\u4e8e\u5173\u952e\u8bcd\u68c0\u7d22\u6765\u8bf4\uff0c\u8fd9\u5c06\u662f\u4e00\u4e2a\u6709\u6311\u6218\u6027\u7684\u95ee\u9898\uff0c\u56e0\u4e3a\u5b83\u66f4\u5173\u6ce8\u5305\u542b\u201c\u5386\u53f2\u201d\u6216\u201c\u4eba\u7269\u201d\u7684\u6587\u7ae0\uff0c\u5e76\u4e0d\u80fd\u6355\u6349\u5230\u95ee\u9898\u7684\u771f\u5b9e\u610f\u4e49\u3002\u6211\u4eec\u5e0c\u671b\u7a20\u5bc6\u68c0\u7d22\u53ef\u4ee5\u505a\u5f97\u66f4\u597d\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u5c06\u8c03\u7528\u7a20\u5bc6\u68c0\u7d22\u51fd\u6570\u6765\u83b7\u53d6\u66f4\u51c6\u786e\u7684\u7ed3\u679c\u3002"]}, {"cell_type": "code", "execution_count": 110, "id": "be2e5378-ea37-4726-b3c3-5875d46759e7", "metadata": {}, "outputs": [], "source": ["from utils import dense_retrieval"]}, {"cell_type": "code", "execution_count": 111, "id": "d5af11ea-6c30-4303-8c9e-8a5510e046bb", "metadata": {}, "outputs": [], "source": ["query_2 = \"Who is the tallest person in history?\""]}, {"cell_type": "code", "execution_count": 112, "id": "4da5c744-01b8-4780-a615-0a5edf9bfbd6", "metadata": {}, "outputs": [], "source": ["results = dense_retrieval(query_2, client)"]}, {"cell_type": "code", "execution_count": 113, "id": "9e4540d8-ed5e-4f97-8802-6d39a52b8964", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", 
"text": ["i:0\n", "Robert Wadlow\n", "Robert Pershing Wadlow (February 22, 1918 July 15, 1940), also known as the Alton Giant and the Giant of Illinois, was a man who was the tallest person in recorded history for whom there is irrefutable evidence. He was born and raised in Alton, Illinois, a small city near St. Louis, Missouri.\n", "\n", "i:1\n", "Manute Bol\n", "Bol came from a family of extraordinarily tall men and women. He said: \"My mother was , my father , and my sister is . And my great-grandfather was even taller\u2014.\" His ethnic group, the Dinka, and the Nilotic people of which they are a part, are among the tallest populations in the world. Bol's hometown, Turalei, is the origin of other exceptionally tall people, including basketball player Ring Ayuel. \"I was born in a village, where you cannot measure yourself,\" Bol reflected. \"I learned I was 7 foot 7 in 1979, when I was grown. I was about 18 or 19.\"\n", "\n", "i:2\n", "Sultan K\u00f6sen\n", "Sultan K\u00f6sen (born 10 December 1982) is a Turkish farmer who holds the Guinness World Record for tallest living male at . Of Kurdish ethnicity, he is the seventh tallest man in history.\n", "\n", "i:3\n", "Sultan K\u00f6sen\n", "K\u00f6sen turned 40 years old on 10 December 2022. He celebrated his birthday a few days early by visiting the Ripley's Believe It or Not! museum in Orlando, Florida, USA and posing next to a life-sized statue of Robert Wadlow, the tallest man ever at 272 cm (8 ft 11.1 in).\n", "\n", "i:4\n", "Netherlands\n", "The Dutch are the tallest people in the world, by nationality, with an average height of for adult males and for adult females in 2009. The average height of young males in the Netherlands increased from 5 feet, 4 inches to approximately 6 feet between the 1850s until the early 2000s. 
People in the south are on average about shorter than those in the north.\n", "\n"]}], "source": ["for i, result in enumerate(results):\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " print(result.get('text'))\n", " print()"]}, {"cell_type": "code", "execution_count": 116, "id": "e940eec6", "metadata": {}, "outputs": [], "source": ["query_2_zh = \"\u5386\u53f2\u4e0a\u6700\u9ad8\u7684\u4eba?\""]}, {"cell_type": "code", "execution_count": 122, "id": "f493fcc8", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "\u9b91\u559c\u9806\n", "\u9c8d\u559c\u987a\u57282005\u5e74\u30012008\u5e74\u4e24\u5ea6\u7372\u5f97\u5409\u5c3c\u65af\u4e16\u754c\u7eaa\u5f55\u8a8d\u8b49\u88ab\u627f\u8a8d\u662f\u5730\u7403\u4e0a\u56e0\u81ea\u7136\u539f\u56e0\u800c\u9577\u5f97\u6700\u9ad8\u7684\u6d3b\u4eba\uff08\u5176\u4ed6\u64c1\u6709\u4e16\u754c\u4e4b\u6700\u982d\u929c\u7684\u9577\u4eba\u4e0d\u662f\u5df2\u901d\uff0c\u5982\u7f57\u4f2f\u7279\u00b7\u74e6\u5fb7\u7f57\uff0c\u5c31\u662f\u56e0\u7279\u6b8a\u75c5\u8b8a\u800c\u8fc7\u5ea6\u53d1\u80b2\u7a31\u70ba\u5de8\u4eba\u75c7\uff0c\u5982\u5217\u6602\u5c3c\u5fb7\u00b7\u65af\u5854\u5fb7\u5c3c\u514b\uff09\u3002\u5b9e\u9645\u4e0a\uff0c\u8fd9\u4e00\u7eaa\u5f55\u57282007\u5e74\u5373\u88ab\u4e4c\u514b\u5170\u4eba\u5217\u6602\u5c3c\u5fb7\u00b7\u65af\u5854\u5fb7\u5c3c\u514b\uff08Leonid Stadnyk\uff09\u53d6\u4ee3\uff0c\u4f46\u56e0\u5176\u540e\u6765\u62d2\u7edd\u63a5\u53d7\u5409\u5c3c\u65af\u4e16\u754c\u7eaa\u5f55\u7ec4\u7ec7\u6d4b\u91cf\u8eab\u9ad8\u800c\u5e76\u672a\u88ab\u8be5\u7ec4\u7ec7\u7ee7\u7eed\u8ba4\u53ef\uff0c\u6545\u9c8d\u559c\u987a\u91cd\u65b0\u6210\u4e3a\u4e16\u754c\u6700\u9ad8\u4eba\uff0c\u76f4\u81f32009\u5e74\u8be5\u7eaa\u5f55\u6301\u6709\u8005\u88ab\u571f\u8033\u5176\u4eba\u8607\u4e39\u00b7\u79d1\u585e\uff08Sultan K\u00f6sen\uff09\u53d6\u4ee3\u3002\n", "\n", "i:1\n", "\u8607\u4e39\u00b7\u79d1\u585e\n", 
"\u8607\u4e39\u00b7\u79d1\u585e\uff08\uff1b\uff09\uff0c\u51fa\u751f\u65bc\u571f\u8033\u5176\u9a6c\u5c14\u4e01\uff0c\u662f\u81ea2009\u5e74\u8d77\u88ab\u78ba\u8a8d\u70ba\u5168\u4e16\u754c\u6700\u9ad8\u7684\u4eba\uff0c\u88ab\u5217\u5165\u91d1\u6c0f\u4e16\u754c\u7d00\u9304\u5927\u5168\uff0c\u5176\u96d9\u624b\u548c\u8173\u638c\u4ea6\u6253\u7834\u91d1\u6c0f\u4e16\u754c\u7d00\u9304\u5927\u5168\uff0c\u8173\u638c\u9577\u905440\u516c\u5206\u30022009\u5e74\u6642\uff0c\u79d1\u585e\u9ad8247\u516c\u5206\uff0c\u5230\u4e862012\u5e74\uff0c\u4ed6\u9ad8\u4e864\u516c\u5206\uff0c\u9054\u5230251\u516c\u5206\u3002\n", "\n", "i:2\n", "\u746a\u9e97\u84ee\u00b7\u6c83\u65af\u00b7\u838e\u51e1\u7279\n", "\u746a\u9e97\u84ee\u00b7\u6c83\u65af\u00b7\u838e\u51e1\u7279\uff08Marilyn vos Savant\uff0c\uff09\u66fe\u7d93\u88ab\u8a18\u8f09\u70ba\u5409\u5c3c\u65af\u4e16\u754c\u8a18\u9304\u6240\u8a8d\u5b9a\u64c1\u6709\u6700\u9ad8\u667a\u5546\u7684\u4eba\u985e\u53ca\u5973\u6027 (1984 to 1989)\u3002\u5979\u65bc1946\u5e74\u51fa\u751f\u65bc\u7f8e\u570b\u5bc6\u82cf\u91cc\u5dde\u7684\u5723\u8def\u6613\u65af\uff0c\u746a\u9e97\u84ee\u5728\u525b\u6eff10\u6b72\u76841956\u5e749\u6708\u6642\u521d\u6b21\u63a5\u53d7\u53f2\u4e39\u798f-\u6bd4\u5948\u667a\u529b\u6e2c\u9a57 \uff08\u5fc3\u667a\u5e74\u9f61\u6bd4\u4f8b\u667a\u5546\uff09\uff0c\u6e2c\u5f97\u667a\u5546\u9ad8\u9054228\uff0c\u4e26\u767b\u4e0a\u4e16\u754c\u7d00\u9304\u3002\u7136\u800c\uff0c\u667a\u5546\u7684\u5224\u5b9a\u8207\u6bd4\u8f03\u65b9\u5f0f\u5f8c\u4f86\u906d\u5230\u722d\u8b70\uff0c \u96a8\u5f8c\u5409\u5c3c\u65af\u4e16\u754c\u8a18\u9304\u57281990\u5e74\u79fb\u9664\u4e86\u201c\u667a\u5546\u6700\u9ad8\u7684\u4eba\u201d\u9019\u500b\u9805\u76ee\u3002\n", "\n", "i:3\n", "\u827e\u5fb7\u8499\u00b7\u5e0c\u62c9\u91cc\n", "\u827e\u5fb7\u8499\u00b7\u73c0\u897f\u74e6\u5c14\u00b7\u5e0c\u62c9\u91cc\u7235\u58eb\uff0cKG\uff0cONZ\uff0cKBE\uff08Edmund Percival 
Hillary\uff0c\uff09\uff0c\u662f\u7d10\u897f\u862d\u767b\u5c71\u5bb6\u548c\u63a2\u96aa\u5bb6\uff0c\u5728\u548c\u96ea\u5df4\u4eba\u56ae\u5c0e\u4e39\u589e\u00b7\u8bfa\u76d6\u7684\u5408\u4f5c\u4e4b\u4e0b\uff0c\u4ed6\u548c\u4e39\u589e\u00b7\u8bfa\u76d6\u6210\u4e86\u53ef\u8b49\u660e\u7684\u8a18\u9304\u4e2d\u6700\u65e9\u6210\u529f\u6500\u767b\u73e0\u7a46\u6717\u746a\u5cf0\u5cf0\u9802\u7684\u4eba\u3002\n", "\n", "i:4\n", "\u827e\u746a\u00b7\u83ab\u62c9\u8afe\n", "\u827e\u746a\u00b7\u99ac\u4e01\u5a1c\u00b7\u9732\u6613\u5409\u4e9e\u00b7\u83ab\u62c9\u8afe\uff08\uff0c\uff09\uff0c\u751f\u65bc\u610f\u5927\u5229\u5947\u7dad\u4e9e\u65af\u79d1\uff0c\u8d85\u7d1a\u4eba\u745e\uff0c\u66fe\u662f\u4e16\u754c\u6700\u5e74\u9577\u8005\uff08\u4e16\u754c\u7d00\u9304\u7b2c5\u540d\uff09\u548c1890\u5e74\u4ee3\u6700\u5f8c1\u4f4d\u53bb\u4e16\u7684\u4eba\u3002\u9664\u6b64\u4e4b\u5916\uff0c\u5979\u4e5f\u662f\u6b50\u6d32\u7684\u7b2c3\u5e74\u9577\u8005\uff08\u4eab\u5d69\u58fd117\u6b72\u53c8137\u5929\uff09\uff0c\u50c5\u6b21\u65bc\u96c5\u5a1c\u00b7\u5361\u723e\u66fc\u7279\uff08\u4eab\u5d69\u58fd122\u6b72164\u5929\uff09\u548c\u9732\u897f\u723e\u00b7\u6717\u6771\uff08\u751f\u65bc1904\u5e742\u670811\u65e5\uff09\u3002\n", "\n"]}], "source": ["for i, result in enumerate(results_zh):\n", " print(f\"i:{i}\")\n", " print(result.get('title'))\n", " print(result.get('text'))\n", " print()"]}, {"cell_type": "markdown", "id": "8f769bf4", "metadata": {}, "source": ["\u6211\u4eec\u53d1\u73b0\u8fd9\u91cc\u5df2\u7ecf\u83b7\u5230\u4e86\u6b63\u786e\u7684\u7b54\u6848\uff1a\u5386\u53f2\u4e0a\u6700\u9ad8\u7684\u4eba\u662f\u7f57\u4f2f\u7279\u00b7\u6c83\u5fb7\u6d1b\u3002\u800c\u4e14\u8fd8\u67e5\u8be2\u5230\u4e86\u5176\u4ed6\u6587\u4ef6\u3002\n", "\n", "\u4e0d\u8fc7\uff0c\u6211\u4eec\u4ecd\u7136\u53ef\u4ee5\u4f7f\u7528\u91cd\u6392\u6765\u5e2e\u52a9\u6211\u4eec\u3002\n", "\n", 
"\u5f53\u6211\u4eec\u5bf9\u8fd9\u4e9b\u7ed3\u679c\u5e94\u7528\u91cd\u6392\u65f6\u4f1a\u53d1\u751f\u4ec0\u4e48\u5462\uff1f\u8ba9\u6211\u4eec\u518d\u6b21\u8c03\u7528\u91cd\u65b0\u6392\u540d\u51fd\u6570\uff0c\u5b83\u5c06\u4f1a\u7ed9\u51fa\u67e5\u8be2\u6587\u672c\u7684\u76f8\u5173\u6027\u5e76\u5bf9\u7ed3\u679c\u8fdb\u884c\u91cd\u65b0\u6392\u5e8f\u3002"]}, {"cell_type": "code", "execution_count": 114, "id": "d269db28-15aa-426a-a993-14275a36ca09", "metadata": {}, "outputs": [], "source": ["texts = [result.get('text') for result in results]\n", "reranked_text = rerank_responses(query_2, texts)"]}, {"cell_type": "code", "execution_count": 115, "id": "aa7aca9b-bdc0-4c08-9615-1a7408854cb4", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "RerankResult\n", "\n", "i:1\n", "RerankResult\n", "\n", "i:2\n", "RerankResult\n", "\n", "i:3\n", "RerankResult\n", "\n", "i:4\n", "RerankResult\n", "\n"]}], "source": ["for i, rerank_result in enumerate(reranked_text):\n", " print(f\"i:{i}\")\n", " print(f\"{rerank_result}\")\n", " print()"]}, {"cell_type": "code", "execution_count": 121, "id": "b1a4efdd", "metadata": {}, "outputs": [], "source": ["results_zh = dense_retrieval(query_2_zh, client, results_lang=\"zh\")"]}, {"cell_type": "code", "execution_count": 123, "id": "0c0b6d83", "metadata": {}, "outputs": [], "source": ["texts_zh = [result.get('text') for result in results_zh]\n", "reranked_text_zh = rerank_responses_zh(query_2_zh, texts_zh)"]}, {"cell_type": "code", "execution_count": 124, "id": "0ae6e68a", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["i:0\n", "RerankResult\n", "\n", "i:1\n", "RerankResult\n", "\n", "i:2\n", "RerankResult\n", "\n", "i:3\n", "RerankResult\n", "\n", "i:4\n", "RerankResult\n", "\n"]}], "source": ["for i, rerank_result in enumerate(reranked_text_zh):\n", " print(f\"i:{i}\")\n", " print(f\"{rerank_result}\")\n", " print()"]}, {"cell_type": "markdown", "id": "d539546c", 
"metadata": {}, "source": ["\u6211\u4eec\u53d1\u73b0\u91cd\u6392\u5f97\u5230\u7684\u7ed3\u679c\u4e2d\u786e\u5b9e\u662f\u4e0e\u7f57\u4f2f\u7279\u00b7\u6c83\u5fb7\u6d1b\u662f\u76f8\u5173\u6027\u6700\u9ad8\u7684\u90a3\u4e2a\uff0c\u4e3a 0.97\u3002\u5bf9\u4e8e\u5176\u4ed6\u6587\u7ae0\uff0c\u5b83\u7ed9\u51fa\u7684\u76f8\u5173\u6027\u5e76\u4e0d\u9ad8\u3002\n", "- PS: \u4e2d\u6587\u652f\u6301\u786e\u5b9e\u4e0d\u592a\u884c"]}, {"cell_type": "markdown", "id": "554b9838", "metadata": {}, "source": ["\u91cd\u65b0\u6392\u540d\u5e2e\u52a9\u6211\u4eec\u786e\u5b9a\u5728\u7a20\u5bc6\u68c0\u7d22\u51fa\u73b0\u7684\u7b54\u6848\u4e2d\u54ea\u4e00\u4e2a\u624d\u662f\u6b63\u786e\u7684\u7b54\u6848\u3002\u73b0\u5728\uff0c\u6211\u4eec\u9f13\u52b1\u60a8\u5728\u8fd9\u91cc\u6682\u505c\uff0c\u5c1d\u8bd5\u81ea\u5df1\u7684\u4f8b\u5b50\u3002\u7528\u81ea\u5df1\u7684\u67e5\u8be2\uff0c\u627e\u5230\u641c\u7d22\u7ed3\u679c\uff0c\u7136\u540e\u4f7f\u7528\u91cd\u6392\u627e\u5230\u6b63\u786e\u7684\u7b54\u6848\u3002\n"]}, {"cell_type": "markdown", "id": "4479827b", "metadata": {}, "source": ["### 4.2 \u641c\u7d22\u7cfb\u7edf\u7684\u8bc4\u4f30"]}, {"cell_type": "markdown", "id": "ac052b00", "metadata": {}, "source": ["\n", "![](images/5-4.png)\n", "\n", "\n", "\u65e2\u7136\u6211\u4eec\u6709\u4e86\u6240\u6709\u8fd9\u4e9b\u641c\u7d22\u7cfb\u7edf\uff0c\u60a8\u53ef\u80fd\u60f3\u77e5\u9053\u5982\u4f55\u8bc4\u4f30\u5b83\u4eec\u3002\u6709\u591a\u79cd[\u8bc4\u4f30\u65b9\u6cd5](https://zhuanlan.zhihu.com/p/351986117)\uff0c\u5982:\n", "\n", "- \u5e73\u5747\u7cbe\u5ea6\uff08MAP\uff09\n", "- \u5e73\u5747\u5012\u6570\u6392\u540d\uff08MRR\uff09\n", "- \u5f52\u4e00\u5316\u6298\u51cf\u7d2f\u79ef\u589e\u76ca\uff08NDCG\uff09\n", " \n", 
"\u90a3\u4e48\uff0c\u5982\u4f55\u521b\u5efa\u4e00\u4e2a\u6d4b\u8bd5\u96c6\u6765\u8bc4\u4f30\u8fd9\u4e9b\u6a21\u578b\u5462\uff1f\u4e00\u4e2a\u4f18\u8d28\u7684\u6d4b\u8bd5\u96c6\u5e94\u8be5\u5305\u542b\u67e5\u8be2\u548c\u6b63\u786e\u7684\u54cd\u5e94\u3002\u7136\u540e\uff0c\u60a8\u53ef\u4ee5\u5c06\u8fd9\u4e9b\u6b63\u786e\u7684\u54cd\u5e94\u4e0e\u6a21\u578b\u7ed9\u51fa\u7684\u54cd\u5e94\u8fdb\u884c\u6bd4\u8f83\uff0c\u5c31\u50cf\u8bc4\u4f30\u5206\u7c7b\u6a21\u578b\u7684\u51c6\u786e\u6027\u3001\u7cbe\u786e\u5ea6\u6216\u53ec\u56de\u7387\u4e00\u6837\u3002\u5982\u679c\u60a8\u60f3\u4e86\u89e3\u66f4\u591a\u5173\u4e8e\u8bc4\u4f30\u641c\u7d22\u7cfb\u7edf\u7684\u4fe1\u606f\uff0c\u6211\u4eec\u5c06\u4f1a\u5728\u8d44\u6e90\u4e2d\u63d0\u4f9b\u4e00\u4e9b\u6587\u7ae0\u94fe\u63a5\u4f9b\u60a8\u53c2\u8003\u3002\n", "\n", "\u73b0\u5728\uff0c\u60a8\u5df2\u7ecf\u5b66\u4f1a\u4f7f\u7528\u641c\u7d22\u548c\u91cd\u6392\u6765\u68c0\u7d22\u5305\u542b\u7279\u5b9a\u95ee\u9898\u7b54\u6848\u7684\u6587\u6863\u3002\u5728\u4e0b\u4e00\u8bfe\u4e2d\uff0c\u60a8\u5c06\u5b66\u4e60\u4e00\u4e9b\u66f4\u6709\u8da3\u7684\u5185\u5bb9\u3002\u60a8\u5c06\u5b66\u4e60\u5982\u4f55\u7ed3\u5408\u641c\u7d22\u7cfb\u7edf\u548c\u751f\u6210\u6a21\u578b\uff0c\u4ee5\u4fbf\u4ee5\u4eba\u7c7b\u7684\u65b9\u5f0f\u8f93\u51fa\u67e5\u8be2\u7684\u7b54\u6848\u3002"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6"}}, "nbformat": 4, "nbformat_minor": 5} \ No newline at end of file +{"cells":[{"cell_type":"markdown","id":"0ead654a","metadata":{},"source":["# 第五章、重排\n","\n"," - [一、配置环境](#一、配置环境)\n"," - [二、稠密检索](#二、稠密检索)\n"," - [2.1 稠密检索的不足](#2.1-稠密检索的不足)\n"," - [2.2 重排模型的训练方式](#2.2-重排模型的训练方式)\n"," - [三、使用重排改进关键词检索](#三、使用重排改进关键词检索)\n"," - 
[四、使用重排改进稠密检索](#四、使用重排改进稠密检索)\n"," - [4.1 进一步理解重排](#4.1-进一步理解重排)\n"," - [4.2 搜索系统的评估](#4.2-搜索系统的评估)\n"]},{"cell_type":"markdown","id":"b9335837","metadata":{},"source":["关键词检索和稠密检索的主要任务是寻找相关的结果,它们返回的分数只是根据某种度量找到的相关结果,并不能完全反映结果和查询之间的真实相关性。\n","\n","重排(Rerank)是一种优化关键词检索和稠密检索结果的方法。它是语义检索中除了稠密检索外的重要组成部分。Rerank 让大型语言模型按照与查询相关性对搜索结果从高到低排序。这种方法可以利用语义信息来更准确地评估文档和查询之间的相关性,从而提高搜索结果的质量。"]},{"cell_type":"markdown","id":"99f6a6f7","metadata":{},"source":["## 一、配置环境\n","\n","让我们先准备好需要用到的一些 Python 库和 API:"]},{"cell_type":"code","execution_count":null,"id":"f350cd1b","metadata":{},"outputs":[],"source":["!pip install cohere \n","!pip install weaviate-client"]},{"cell_type":"code","execution_count":3,"id":"b2febbb9-27dd-4209-838a-99b4f9cdf51b","metadata":{},"outputs":[],"source":["import os\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) # 读取本地 .env 文件"]},{"cell_type":"code","execution_count":4,"id":"dab2ecba-3403-4317-86ef-bd6d92a6cb46","metadata":{},"outputs":[],"source":["import cohere\n","co = cohere.Client(os.environ['COHERE_API_KEY'])"]},{"cell_type":"markdown","id":"3944cd92","metadata":{},"source":["创建连接存储所有维基百科条目数据库的客户端。"]},{"cell_type":"code","execution_count":5,"id":"30737b1b-e4c8-4bd0-a04b-c2ce70d28821","metadata":{},"outputs":[],"source":["import weaviate\n","\n","# 连接到包含 10M 维基百科的用于网络演示的向量数据库\n","# 使用一个公共的拥有只读权限的API键\n","auth_config = weaviate.auth.AuthApiKey(\n"," api_key=os.environ['WEAVIATE_API_KEY']) # \"76320a90-53d8-42bc-b41d-678647c6672e\""]},{"cell_type":"code","execution_count":6,"id":"8781f638-17c7-4ab7-86b5-3763d4d5abad","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/zhihu123/Library/Python/3.9/lib/python/site-packages/weaviate/warnings.py:158: DeprecationWarning: Dep016: You are using the Weaviate v3 client, which is deprecated.\n"," Consider upgrading to the new and improved v4 client instead!\n"," See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python\n"," 
\n"," warnings.warn(\n"]}],"source":["client = weaviate.Client(\n"," url=os.environ['WEAVIATE_API_URL'],\n"," auth_client_secret=auth_config,\n"," additional_headers={\n"," \"X-Cohere-Api-Key\": os.environ['COHERE_API_KEY'],\n"," }\n",")"]},{"cell_type":"markdown","id":"ffcc8e5e","metadata":{},"source":["## 二、稠密检索"]},{"cell_type":"markdown","id":"2f678341","metadata":{},"source":["### 2.1 稠密检索的不足"]},{"cell_type":"markdown","id":"9d6445d5","metadata":{},"source":["首先我们调用上节课的 `dense_retrieval` 函数,查看稠密检索的结果"]},{"cell_type":"code","execution_count":7,"id":"b8561fbf-035e-4856-a97f-8eda21d32a81","metadata":{},"outputs":[],"source":["from utils import dense_retrieval"]},{"cell_type":"code","execution_count":8,"id":"1822cc6c-ddc2-4938-b746-7cda2506d51e","metadata":{},"outputs":[],"source":["from utils import print_result"]},{"cell_type":"code","execution_count":9,"id":"a0b0830e","metadata":{},"outputs":[],"source":["query_1 = \"What is the capital of Canada?\""]},{"cell_type":"code","execution_count":10,"id":"09ba30f0","metadata":{},"outputs":[],"source":["dense_retrieval_results = dense_retrieval(query_1, client)"]},{"cell_type":"code","execution_count":11,"id":"2990c5c4-1b63-453e-8dd8-8568cb7872f5","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["item 0\n","_additional:{'distance': -150.8031}\n","\n","lang:en\n","\n","text:The governor general of the province had designated Kingston as the capital in 1841. However, the major population centres of Toronto and Montreal, as well as the former capital of Lower Canada, Quebec City, all had legislators dissatisfied with Kingston. Anglophone merchants in Quebec were the main group supportive of the Kingston arrangement. In 1842, a vote rejected Kingston as the capital, and study of potential candidates included the then-named Bytown, but that option proved less popular than Toronto or Montreal. 
In 1843, a report of the Executive Council recommended Montreal as the capital as a more fortifiable location and commercial centre, however, the Governor General refused to execute a move without a parliamentary vote. In 1844, the Queen's acceptance of a parliamentary vote moved the capital to Montreal.\n","\n","title:Ottawa\n","\n","url:https://en.wikipedia.org/wiki?curid=22219\n","\n","views:2000\n","\n","\n","item 1\n","_additional:{'distance': -150.28354}\n","\n","lang:en\n","\n","text:For brief periods, Toronto was twice the capital of the united Province of Canada: first from 1849 to 1852, following unrest in Montreal, and later 1856–1858. After this date, Quebec was designated as the capital until 1866 (one year before Canadian Confederation). Since then, the capital of Canada has remained Ottawa, Ontario.\n","\n","title:Toronto\n","\n","url:https://en.wikipedia.org/wiki?curid=64646\n","\n","views:3000\n","\n","\n","item 2\n","_additional:{'distance': -150.02524}\n","\n","lang:en\n","\n","text:Selection of Ottawa as the capital of Canada predates the Confederation of Canada. The selection was contentious and not straightforward, with the parliament of the United Province of Canada holding more than 200 votes over several decades to attempt to settle on a legislative solution to the location of the capital.\n","\n","title:Ottawa\n","\n","url:https://en.wikipedia.org/wiki?curid=22219\n","\n","views:2000\n","\n","\n","item 3\n","_additional:{'distance': -149.92365}\n","\n","lang:en\n","\n","text:Until the late 18th century Québec was the most populous city in present-day Canada. As of the census of 1790, Montreal surpassed it with 18,000 inhabitants, but Quebec (pop. 14,000) remained the administrative capital of New France. It was then made the capital of Lower Canada by the Constitutional Act of 1791. 
From 1841 to 1867, the capital of the Province of Canada rotated between Kingston, Montreal, Toronto, Ottawa and Quebec City (from 1852 to 1856 and from 1859 to 1866).\n","\n","title:Quebec City\n","\n","url:https://en.wikipedia.org/wiki?curid=100727\n","\n","views:2000\n","\n","\n","item 4\n","_additional:{'distance': -149.71033}\n","\n","lang:en\n","\n","text:The Quebec Conference on Canadian Confederation was held in the city in 1864. In 1867, Queen Victoria chose Ottawa as the definite capital of the Dominion of Canada, while Quebec City was confirmed as the capital of the newly created province of Quebec.\n","\n","title:Quebec City\n","\n","url:https://en.wikipedia.org/wiki?curid=100727\n","\n","views:2000\n","\n","\n"]}],"source":["print_result(dense_retrieval_results)"]},{"cell_type":"markdown","id":"385d45ad","metadata":{},"source":["注:经过测试,发现当前数据库中文语料可能较少,对中文检索比较简单,所以对查询(query)进行了简化。(例如只保留关键词,类似主语)"]},{"cell_type":"code","execution_count":134,"id":"15694a5c-3525-49cc-b5e9-d1c34ae0fbe9","metadata":{},"outputs":[],"source":["query_1 = \"加拿大首都\""]},{"cell_type":"code","execution_count":135,"id":"6dfede25-8a43-41c9-9328-d331695c4fcb","metadata":{},"outputs":[],"source":["dense_retrieval_results = dense_retrieval(query_1, client, 'zh')"]},{"cell_type":"code","execution_count":136,"id":"1bb57590","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["item 0\n","_additional:{'distance': -152.25616}\n","\n","lang:zh\n","\n","text:18世纪晚期之前,魁北克城一直是加拿大人口最多的城市。在1790年的普查期间,蒙特利尔以18,000居民超过了魁北克,但魁北克(14,000人口)依然保住了新法兰西行政首府的地位。在1791年宪法中,魁北克城成为下加拿大的首府。从1841年到1867年,加拿大省的首府在几个城市之间轮替,包括金士顿,蒙特利尔,多伦多,渥太华和魁北克(1852年到1856年,1859年到1866年)。\n","\n","title:魁北克市\n","\n","url:https://zh.wikipedia.org/wiki?curid=117192\n","\n","views:600\n","\n","\n","item 1\n","_additional:{'distance': 
-150.94444}\n","\n","lang:zh\n","\n","text:渥太華()是加拿大的聯邦首都,全國第四大城市,市區人口是934,243人,首都圈地區是1,323,783人(根據2016年人口普查),面積2,779平方公里,位於安大略省東南部,渥太華河南岸,多倫多以東400公里,蒙特利爾以西190公里。與美國、澳大利亞等國不同,渥太華不是聯邦直轄的行政區,但是渥太華土地管理和城市規劃是由國家首都委員會(National Capital Commission)負責。\n","\n","title:渥太華\n","\n","url:https://zh.wikipedia.org/wiki?curid=70236\n","\n","views:800\n","\n","\n","item 2\n","_additional:{'distance': -150.90271}\n","\n","lang:zh\n","\n","text:1857年12月31日,维多利亚女王选择渥太华为加拿大省的首都(包括现在的安大略和魁北克)。虽然现代的渥太华是加拿大第四大城市,但在当年,她仅仅是一个木材贸易通道中的内陆小镇,并且距离殖民地的几个主要城市(东部的蒙特利尔和魁北克城;西部的多伦多和京士頓)路途遥远。女王的顾问们建议渥太华成为首都之选有两大重要理由:首先,渥太华是唯一具有一定规模、并且位于加拿大省东西部边界地(现安大略与魁北克边界)的城市,定都于此是平衡两个殖民地及其英裔、法裔居民的聪明妥协之举;其次,1812年战争表明,其他主要城市容易受到美国人的攻击,因为过于靠近美加边界。渥太华位于腹地,易于防守,渥太华河及丽都运河使之与加拿大东西部之间交通极为便利。另外两个方面的考虑是:渥太华正好介于多伦多和魁北克城之间(距离这两个城市都是500公里),并且城市规模较小,因而不容易受到大规模的暴徒袭击,因为政治动机,以往的首都城市都受到过这种攻击。\n","\n","title:渥太華\n","\n","url:https://zh.wikipedia.org/wiki?curid=70236\n","\n","views:800\n","\n","\n","item 3\n","_additional:{'distance': -150.68478}\n","\n","lang:zh\n","\n","text:多伦多(,),是北美洲国家加拿大安大略省首府,加拿大的最大城市。多伦多坐落在安大略湖西北岸的南安大略地区。根据2021年的加拿大人口普查,多伦多市人口达2,794,356人,为加拿大最大城市。多伦多市是大多伦多地区的心脏地区,也是安大略省南部人口稠密区(称作“金馬蹄地區”)的一部分。都會区有6,202,225名居民,而覆蓋範圍較廣的大多倫多地區則有9,765,188名居民。作為加拿大的经济中心,多伦多是一個世界级城市,也是世界上最大的金融中心之一。多伦多在经济上的领先地位在于金融、商业服务、电信、航太、交通运输、媒体、艺术、电影、电视製作、出版、软件、医药研究、教育、旅游、体育等产业。多伦多证券交易所是世界第七大交易所,总部设于市内,有多数加拿大公司在这里上市。\n","\n","title:多伦多\n","\n","url:https://zh.wikipedia.org/wiki?curid=3132\n","\n","views:1000\n","\n","\n","item 4\n","_additional:{'distance': 
-150.47894}\n","\n","lang:zh\n","\n","text:蒙特婁曾经是加拿大经济首都,拥有最多的人口及最发达的经济,但是在1976年蒙特婁奧運會后被安大略省的多伦多超过。今天蒙特利尔仍然是加拿大最重要的经济中心之一,人工智慧、航空工业、金融、设计、电影工业等行业发达。蒙特婁被认为是世界最佳宜居城市,并被联合国教育、科学及文化组织认定为设计之城。1999年第35屆國際技能競賽在這裡舉行。\n","\n","title:蒙特利尔\n","\n","url:https://zh.wikipedia.org/wiki?curid=43791\n","\n","views:1000\n","\n","\n"]}],"source":["print_result(dense_retrieval_results)"]},{"cell_type":"markdown","id":"88200092","metadata":{},"source":["让我们查看检索结果:\n","\n","结果中第二个是正确的,是渥太华。但也有一些结果并不是正确答案。多伦多不是加拿大的首都。然后,我们还有魁北克市,这是错误的答案。为什么会发生这种情况呢?\n","\n","通过一个小例子来帮助理解这个概念。虽然和当前的搜索结果有点不同,但有助于我们理解这个情况。\n","\n","假设查询的问题是“加拿大的首都是什么?”,可能的回答有以下五个:\n","\n","- 加拿大的首都是渥太华:这是正确的。\n","\n","- 多伦多位于加拿大:这也是正确的,但与问题无关。\n","\n","- 法国的首都是巴黎:这也是正确的,但不是问题的答案。\n","\n","- 加拿大的首都是悉尼:这是不正确的。\n","\n","- 安大略的省会是多伦多:这是正确的,但同样未能回答问题。\n","\n","\n","![Dense Retrieval is also not perfect](images/5-1.png)"]},{"cell_type":"markdown","id":"4200b1f2","metadata":{},"source":["进行稠密检索时会发生什么呢?\n","\n","我们假设五个句子在 embedding 空间的分布如图所示。稠密检索的原理是将查询转换为 embedding,然后返回与之最接近的内容,即“安大略的省会是多伦多”。稠密检索看重**语义相似性**,因此它返回与问题最相似的内容。但这可能不是正确的答案,甚至可能不是真实的陈述,它只是一个在语义上与问题接近的句子。因此,稠密检索返回的有可能并非答案。我们如何修复这个问题呢?这就是重排起作用的地方。"]},{"cell_type":"markdown","id":"acdc556d","metadata":{},"source":["![rerank](images/5-2.png)"]},{"cell_type":"markdown","id":"ff2ff78a","metadata":{},"source":["假设查询是“加拿大的首都是什么”,此时有10个可能的答案,其中一些与问题相关,而另一些则不相关。因此,当我们使用稠密检索时,它会返回与查询最相似的五个内容。假设返回内容就是绿色的这些句子。现在我们有五个与查询非常接近的句子,但我们不知道哪一个才是正确答案。这就是 Rerank 发挥作用的地方。\n","\n","重排模型为每个“查询-结果”对打出一个相关性得分,告诉您答案相对于查询的相关程度。这 5 个句子最高相关性为 0.9,对应于“加拿大的首都是渥太华”,这就是正确的答案。这就是重排的作用。"]},{"cell_type":"markdown","id":"f8f5cbaf","metadata":{},"source":["### 2.2 
重排模型的训练方式"]},{"cell_type":"markdown","id":"68c76221","metadata":{},"source":["![rerank_training](images/5-3.png)"]},{"cell_type":"markdown","id":"cc62573b","metadata":{},"source":["重排模型的训练需要大量的高质量样本,这些样本包括与查询高度相关的响应或文档。训练的目标是使模型能够给出高相关性的得分。同时,我们也需要提供一些错误的查询响应作为样本,这些响应可能与查询不完全匹配,可能是接近但不符合的情况,或者是一个可能与查询不匹配的文档。通过训练模型对优质的查询响应给出高分,对不理想的查询响应给出低分,以此获得一个能够分配相关性的重排模型。当查询和响应高度相关时,该模型将给出高得分。"]},{"cell_type":"markdown","id":"db449134","metadata":{},"source":["## 三、使用重排改进关键词检索"]},{"cell_type":"markdown","id":"e15d7130","metadata":{},"source":["我们将导入之前在第一课中使用的关键词检索函数。再次问它,“加拿大的首都是什么”"]},{"cell_type":"code","execution_count":12,"id":"8071c68a-6dec-47f9-b5e1-473f9acdc83f","metadata":{},"outputs":[],"source":["from utils import keyword_search"]},{"cell_type":"code","execution_count":27,"id":"e851efa5-10c7-4f98-85f1-2a1c565d9723","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","Monarchy of Canada\n","In his 1990 book, \"Continental Divide: the Values and Institutions of the United States and Canada,\" Seymour Martin Lipset argues that the presence of the monarchy in Canada helps distinguish Canadian identity from American identity. Since at least the 1930s, supporters of the Crown have held the opinion that the Canadian monarch is also one of the rare unified elements of Canadian society, focusing both \"the historic consciousness of the nation\" and various forms of patriotism and national love \"[on] the point around which coheres the nation's sense of a continuing personality\". Former Governor General Vincent Massey articulated in 1967 that the monarchy \"is part of ourselves. It is linked in a very special way with our national life. 
It stands for qualities and institutions which mean Canada to every one of us and which for all our differences and all our variety have kept Canada Canadian.\" But, according to Arthur Bousfield and Gary Toffoli, Canadians were, through the late 1960s to the 2000s, encouraged by the federal government to \"neglect, ignore, forget, reject, debase, suppress, even hate, and certainly treat as foreign what their parents and grandparents, whether spiritual or blood, regarded as the basis of Canadian nationhood, autonomy, and history\", including the monarchy. Former Governor General Roland Michener said in 1970 that anti-monarchists claimed the Canadian Crown is foreign and incompatible with Canada's multicultural society, which the government promoted as a Canadian identifier, and Lawrence Martin called in 2007 for Canada to become a republic in order to \"re-brand the nation\". However, Michener also stated, \"[the monarchy] is our own by inheritance and choice, and contributes much to our distinctive Canadian identity and our chances of independent survival amongst the republics of North and South America.\" Journalist Christina Blizzard emphasized in 2009 that the monarchy \"made [Canada] a haven of peace and justice for immigrants from around the world\", while Michael Valpy contended in 2009 that the Crown's nature permitted non-conformity amongst its subjects, thereby opening the door to multiculturalism and pluralism.\n","i:1\n","Early modern period\n","North America outside the zone of Spanish settlement was a contested area in the 17th century. Spain had founded small settlements in Florida and Georgia but nowhere near the size of those in New Spain or the Caribbean islands. France, The Netherlands, and Great Britain held several colonies in North America and the West Indies from the 17th century, 100 years after the Spanish and Portuguese established permanent colonies. 
The British colonies in North America were founded between 1607 (Virginia) and 1733 (Georgia). The Dutch explored the east coast of North America and began founding settlements in what they called New Netherland (now New York State.). France colonized what is now Eastern Canada, founding Quebec City in 1608. France's loss in the Seven Years' War resulted in the transfer of New France to Great Britain. The Thirteen Colonies, in lower British North America, rebelled against British rule in 1775, largely due to the taxation that Great Britain was imposing on the colonies. The British colonies in Canada remained loyal to the crown, and a provisional government formed by the Thirteen Colonies proclaimed their independence on July 4, 1776 and subsequently became the original 13 United States of America. With the 1783 Treaty of Paris ending the American Revolutionary War, Britain recognised the former Thirteen Colonies' independence.\n","i:2\n","Flag of Canada\n","By the Second World War, the Red Ensign was viewed as Canada's \"de facto\" national flag. A joint committee of the Senate and House of Commons was appointed on November 8, 1945, to recommend a national flag to officially adopt. It received 2,409 designs from the public and was addressed by the director of the Historical Section of the Canadian Army, Fortescue Duguid, who pointed out that red and white were Canada's official colours and there was already an emblem representing the country: three joined maple leaves seen on the escutcheon of the Canadian coat of arms. By May 9 the following year, the committee reported back with a recommendation \"that the national flag of Canada should be the Canadian red ensign with a maple leaf in autumn golden colours in a bordered background of white\". 
The Legislative Assembly of Quebec had urged the committee to not include any of what it deemed as \"foreign symbols\", including the Union Flag, and Mackenzie King, then still prime minister, declined to act on the report; fearing it may lead to political instability. As a result, the Union Flag was kept as a national flag, and the order to fly the Canadian Red Ensign at government buildings was maintained.\n"]}],"source":["query_1 = \"What is the capital of Canada?\"\n","results = keyword_search(query_1,\n"," client,\n"," properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n"," num_results=3\n"," )\n","\n","for i, result in enumerate(results):\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," print(result.get('text'))"]},{"cell_type":"code","execution_count":169,"id":"b1b88822","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","首都\n","每一個國家通常只設立一個首都,因為政府通常會將其重要機關集中在首都地區,以方便政府高層行政和管理,但亦有例外。一些國家有多個首都,一些甚至沒有。有時候,實際的首都和法定的首都由於某些原因並不在同一個城市。譬如,一個稱為「首都」的城市,實際上並非中央政府所在地。反之,所謂的正式「首都」雖然是中央政府的所在地,但可能不是政治決策的地理中心。故此,「行政首都」一般被認定為是該國的「國家首都」。\n","i:1\n","幻想戰記\n","遊戲中存在六個大陸,各個大陸的地圖之間沒有物理連接,地圖與地圖之間的移動方式為點選各個大陸上稱為「戰場」或者「首都」的據點。其中各個「戰場」是可以宣戰的地圖,而「首都」則不能被進攻(也就是說就算任何一個國家的本土被侵佔完畢該國也不會滅國)。\n","i:2\n","首都\n","首都,作為國家政治、經濟、文化的會聚並不是永恆不變的。在古代,國家一般採取中央集權政策,地方勢力有限;首都一旦淪陷,就意味著朝代的覆亡。中國三國時代,蜀漢、吳因失去各自的首都──成都和建業(今南京)而亡國。\n"]}],"source":["query_2_zh = \"加拿大 首都\"\n","results_zh = keyword_search(query_2_zh,\n"," client,\n"," results_lang='zh',\n"," properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n"," num_results=3\n"," )\n","\n","for i, result in enumerate(results_zh):\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," print(result.get('text'))"]},{"cell_type":"markdown","id":"cfcd3ed4","metadata":{},"source":["- 英文答案输出的前三个答案并不理想。它们涉及加拿大的君主制、早期现代时期和加拿大国旗。\n","- 
中文答案输出的前三个答案相关性更差。它们只考虑了首都,没有关于加拿大的信息。\n","\n","为什么它们会这样呢?因为关键词检索仅仅是在查找与查询有许多共同单词的文档,但无法真正判断这些文档是否确实在回答问题。所有这些文章都包含许多与查询相同的单词,但它们并非答案。"]},{"cell_type":"markdown","id":"7268065b","metadata":{},"source":["让我们扩大下检索规模,要求它返回 500 个结果。为了便于观测,这里不打印文本,只打印标题。"]},{"cell_type":"code","execution_count":13,"id":"6e1b2d2c","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","Monarchy of Canada\n","i:1\n","Early modern period\n","i:2\n","Flag of Canada\n","i:3\n","Flag of Canada\n","i:4\n","Prime Minister of Canada\n","i:5\n","Hamilton, Ontario\n","i:6\n","Liberal Party of Canada\n","i:7\n","Stephen Harper\n","i:8\n","Monarchy of Canada\n","i:9\n","Flag of Canada\n"]}],"source":["query_1 = \"What is the capital of Canada?\"\n","results = keyword_search(query_1,\n"," client,\n"," properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n"," num_results=500\n"," )\n","\n","for i, result in enumerate(results[:10]): # 您可以自行调整输出的标题数量\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," #print(result.get('text'))"]},{"cell_type":"code","execution_count":14,"id":"74dd95e6","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","首都\n","i:1\n","幻想戰記\n","i:2\n","首都\n","i:3\n","首府\n","i:4\n","首都 (香港)\n","i:5\n","中華民國首都\n","i:6\n","西安市\n","i:7\n","首都\n","i:8\n","首都\n","i:9\n","首都\n"]}],"source":["query_1_zh = \"加拿大 首都\"\n","results_zh = keyword_search(query_1_zh,\n"," client,\n"," results_lang='zh',\n"," properties=[\"text\", \"title\", \"url\", \"views\", \"lang\", \"_additional {distance}\"],\n"," num_results=500\n"," )\n","\n","for i, result in enumerate(results_zh[:10]): # 您可以自行调整输出的标题数量\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," #print(result.get('text'))"]},{"cell_type":"markdown","id":"faa3ed9e","metadata":{},"source":["这里有打分最高的前 500 个结果。我们如何才能确定这些结果中是否包含答案呢?这就是重排的作用所在。下面这个函数对响应进行重排,并输出打分最高的 10 
个。"]},{"cell_type":"code","execution_count":15,"id":"b38761f8-32b1-4b44-be97-0884894cf6b3","metadata":{},"outputs":[],"source":["import cohere\n","\n","\n","def rerank_responses(query, responses, num_responses=10, results_lang='en'):\n","    \"\"\"\n","    Rerank candidate responses by their relevance to a query with Cohere Rerank.\n","\n","    Uses the notebook-level Cohere client `co`.\n","\n","    Args:\n","        query (str): The search query.\n","        responses (list): Candidate response texts to be reranked.\n","        num_responses (int, optional): Number of top results to return. Defaults to 10.\n","        results_lang (str, optional): 'en' selects the English rerank model; any\n","            other value selects the multilingual model. Defaults to 'en'.\n","\n","    Returns:\n","        The object returned by `co.rerank`; iterating it yields the top\n","        `num_responses` reranked results.\n","    \"\"\"\n","    model_name = 'rerank-english-v2.0' if results_lang == 'en' else 'rerank-multilingual-v2.0'\n","\n","    reranked_responses = co.rerank(\n","        model=model_name,\n","        query=query,\n","        documents=responses,\n","        top_n=num_responses,\n","    )\n","    return reranked_responses\n","\n","\n","def rerank_responses_zh(query, responses, num_responses=10):\n","    \"\"\"\n","    Rerank Chinese responses with the multilingual model.\n","\n","    Thin wrapper over `rerank_responses(..., results_lang='zh')`. Later cells call\n","    this name, so it must be defined for the notebook to run from a fresh kernel.\n","    \"\"\"\n","    return rerank_responses(query, responses, num_responses=num_responses, results_lang='zh')"]},{"cell_type":"markdown","id":"2d84b454","metadata":{},"source":["现在,让我们将答案的文本上进行重排。"]},{"cell_type":"code","execution_count":null,"id":"02d3e55c-0a5b-4b3a-9a59-3f7164927dc0","metadata":{},"outputs":[],"source":["texts = [result.get('text') for result in results] # 只提取结果中的文本\n","reranked_text = rerank_responses(query_1, texts)"]},{"cell_type":"code","execution_count":32,"id":"6b3a380b-cebf-47da-956d-dc62dc53e5a0","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","RerankResult\n","\n","i:1\n","RerankResult\n","\n","i:2\n","RerankResult\n","\n","i:3\n","RerankResult\n","\n","i:4\n","RerankResult\n","\n","i:5\n","RerankResult\n","\n","i:6\n","RerankResult\n","\n","i:7\n","RerankResult\n","\n","i:8\n","RerankResult\n","\n","i:9\n","RerankResult\n","\n"]}],"source":["for i, rerank_result in enumerate(reranked_text):\n"," print(f\"i:{i}\")\n"," print(f\"{rerank_result}\")\n"," print()"]},{"cell_type":"code","execution_count":172,"id":"9bb71208","metadata":{},"outputs":[],"source":["texts_zh = [result.get('text') for result in results_zh]\n","reranked_text_zh = rerank_responses_zh(query_1_zh, 
texts_zh)"]},{"cell_type":"code","execution_count":173,"id":"b483507b","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","RerankResult\n","\n","i:1\n","RerankResult\n","\n","i:2\n","RerankResult\n","\n","i:3\n","RerankResult\n","\n","i:4\n","RerankResult\n","\n","i:5\n","RerankResult\n","\n","i:6\n","RerankResult\n","\n","i:7\n","RerankResult\n","\n","i:8\n","RerankResult\n","\n","i:9\n","RerankResult\n","\n"]}],"source":["for i, rerank_result in enumerate(reranked_text_zh):\n"," print(f\"i:{i}\")\n"," print(f\"{rerank_result}\")\n"," print()"]},{"cell_type":"markdown","id":"689dd251","metadata":{},"source":["在输入查询和结果之后,让我们打印出重排的前 10 个结果。"]},{"cell_type":"markdown","id":"bd5e9b15","metadata":{},"source":["请注意,结果中包含了正确答案。它确定渥太华作为加拿大的首都,并且相关分数非常高,接近 1,达到 0.98。值得注意的是,排名第二的文章也相当不错,它涉及加拿大历史上不同的首都,其相关分数为 0.97。第三个也很出色。重排从关键词检索出的 500 个答案中挑选出了相关性最高的 10 个。"]},{"cell_type":"markdown","id":"f6cbb081","metadata":{},"source":["## 四、使用重排改进稠密检索"]},{"cell_type":"markdown","id":"9e5ae7b5","metadata":{},"source":["### 4.1 进一步理解重排"]},{"cell_type":"markdown","id":"9d9c0d16","metadata":{},"source":["我将再次使用稠密检索函数,尝试解决一个稍微困难的问题。我们询问:\"谁是历史上最高的人?\" 对于关键词检索来说,这将是一个有挑战性的问题,因为它更关注包含“历史”或“人物”的文章,并不能捕捉到问题的真实意义。我们希望稠密检索可以做得更好。因此,我们将调用稠密检索函数来获取更准确的结果。"]},{"cell_type":"code","execution_count":110,"id":"be2e5378-ea37-4726-b3c3-5875d46759e7","metadata":{},"outputs":[],"source":["from utils import dense_retrieval"]},{"cell_type":"code","execution_count":111,"id":"d5af11ea-6c30-4303-8c9e-8a5510e046bb","metadata":{},"outputs":[],"source":["query_2 = \"Who is the tallest person in history?\""]},{"cell_type":"code","execution_count":112,"id":"4da5c744-01b8-4780-a615-0a5edf9bfbd6","metadata":{},"outputs":[],"source":["results = dense_retrieval(query_2, client)"]},{"cell_type":"code","execution_count":113,"id":"9e4540d8-ed5e-4f97-8802-6d39a52b8964","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","Robert Wadlow\n","Robert Pershing Wadlow (February 
22, 1918 July 15, 1940), also known as the Alton Giant and the Giant of Illinois, was a man who was the tallest person in recorded history for whom there is irrefutable evidence. He was born and raised in Alton, Illinois, a small city near St. Louis, Missouri.\n","\n","i:1\n","Manute Bol\n","Bol came from a family of extraordinarily tall men and women. He said: \"My mother was , my father , and my sister is . And my great-grandfather was even taller—.\" His ethnic group, the Dinka, and the Nilotic people of which they are a part, are among the tallest populations in the world. Bol's hometown, Turalei, is the origin of other exceptionally tall people, including basketball player Ring Ayuel. \"I was born in a village, where you cannot measure yourself,\" Bol reflected. \"I learned I was 7 foot 7 in 1979, when I was grown. I was about 18 or 19.\"\n","\n","i:2\n","Sultan Kösen\n","Sultan Kösen (born 10 December 1982) is a Turkish farmer who holds the Guinness World Record for tallest living male at . Of Kurdish ethnicity, he is the seventh tallest man in history.\n","\n","i:3\n","Sultan Kösen\n","Kösen turned 40 years old on 10 December 2022. He celebrated his birthday a few days early by visiting the Ripley's Believe It or Not! museum in Orlando, Florida, USA and posing next to a life-sized statue of Robert Wadlow, the tallest man ever at 272 cm (8 ft 11.1 in).\n","\n","i:4\n","Netherlands\n","The Dutch are the tallest people in the world, by nationality, with an average height of for adult males and for adult females in 2009. The average height of young males in the Netherlands increased from 5 feet, 4 inches to approximately 6 feet between the 1850s until the early 2000s. 
People in the south are on average about shorter than those in the north.\n","\n"]}],"source":["for i, result in enumerate(results):\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," print(result.get('text'))\n"," print()"]},{"cell_type":"code","execution_count":116,"id":"e940eec6","metadata":{},"outputs":[],"source":["query_2_zh = \"历史上最高的人?\"\n","# Run the Chinese dense retrieval here so the next cell prints results for this query\n","# rather than the stale keyword-search `results_zh` left over from the previous section.\n","results_zh = dense_retrieval(query_2_zh, client, results_lang=\"zh\")"]},{"cell_type":"code","execution_count":122,"id":"f493fcc8","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","鮑喜順\n","鲍喜顺在2005年、2008年两度獲得吉尼斯世界纪录認證被承認是地球上因自然原因而長得最高的活人(其他擁有世界之最頭銜的長人不是已逝,如罗伯特·瓦德罗,就是因特殊病變而过度发育稱為巨人症,如列昂尼德·斯塔德尼克)。实际上,这一纪录在2007年即被乌克兰人列昂尼德·斯塔德尼克(Leonid Stadnyk)取代,但因其后来拒绝接受吉尼斯世界纪录组织测量身高而并未被该组织继续认可,故鲍喜顺重新成为世界最高人,直至2009年该纪录持有者被土耳其人蘇丹·科塞(Sultan Kösen)取代。\n","\n","i:1\n","蘇丹·科塞\n","蘇丹·科塞(;),出生於土耳其马尔丁,是自2009年起被確認為全世界最高的人,被列入金氏世界紀錄大全,其雙手和腳掌亦打破金氏世界紀錄大全,腳掌長達40公分。2009年時,科塞高247公分,到了2012年,他高了4公分,達到251公分。\n","\n","i:2\n","瑪麗蓮·沃斯·莎凡特\n","瑪麗蓮·沃斯·莎凡特(Marilyn vos Savant,)曾經被記載為吉尼斯世界記錄所認定擁有最高智商的人類及女性 (1984 to 1989)。她於1946年出生於美國密苏里州的圣路易斯,瑪麗蓮在剛滿10歲的1956年9月時初次接受史丹福-比奈智力測驗 (心智年齡比例智商),測得智商高達228,並登上世界紀錄。然而,智商的判定與比較方式後來遭到爭議, 隨後吉尼斯世界記錄在1990年移除了“智商最高的人”這個項目。\n","\n","i:3\n","艾德蒙·希拉里\n","艾德蒙·珀西瓦尔·希拉里爵士,KG,ONZ,KBE(Edmund Percival Hillary,),是紐西蘭登山家和探險家,在和雪巴人嚮導丹增·诺盖的合作之下,他和丹增·诺盖成了可證明的記錄中最早成功攀登珠穆朗瑪峰峰頂的人。\n","\n","i:4\n","艾瑪·莫拉諾\n","艾瑪·馬丁娜·露易吉亞·莫拉諾(,),生於意大利奇維亞斯科,超級人瑞,曾是世界最年長者(世界紀錄第5名)和1890年代最後1位去世的人。除此之外,她也是歐洲的第3年長者(享嵩壽117歲又137天),僅次於雅娜·卡爾曼特(享嵩壽122歲164天)和露西爾·朗東(生於1904年2月11日)。\n","\n"]}],"source":["for i, result in enumerate(results_zh):\n"," print(f\"i:{i}\")\n"," print(result.get('title'))\n"," print(result.get('text'))\n"," print()"]},{"cell_type":"markdown","id":"8f769bf4","metadata":{},"source":["我们发现这里已经获到了正确的答案:历史上最高的人是罗伯特·沃德洛。而且还查询到了其他文件。\n","\n","不过,我们仍然可以使用重排来帮助我们。\n","\n","当我们对这些结果应用重排时会发生什么呢?让我们再次调用重新排名函数,它将会给出查询文本的相关性并对结果进行重新排序。"]},{"cell_type":"code","execution_count":114,"id":"d269db28-15aa-426a-a993-14275a36ca09","metadata":{},"outputs":[],"source":["texts = [result.get('text') for result in 
results]\n","reranked_text = rerank_responses(query_2, texts)"]},{"cell_type":"code","execution_count":115,"id":"aa7aca9b-bdc0-4c08-9615-1a7408854cb4","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","RerankResult\n","\n","i:1\n","RerankResult\n","\n","i:2\n","RerankResult\n","\n","i:3\n","RerankResult\n","\n","i:4\n","RerankResult\n","\n"]}],"source":["for i, rerank_result in enumerate(reranked_text):\n"," print(f\"i:{i}\")\n"," print(f\"{rerank_result}\")\n"," print()"]},{"cell_type":"code","execution_count":121,"id":"b1a4efdd","metadata":{},"outputs":[],"source":["results_zh = dense_retrieval(query_2_zh, client, results_lang=\"zh\")"]},{"cell_type":"code","execution_count":123,"id":"0c0b6d83","metadata":{},"outputs":[],"source":["texts_zh = [result.get('text') for result in results_zh]\n","reranked_text_zh = rerank_responses_zh(query_2_zh, texts_zh)"]},{"cell_type":"code","execution_count":124,"id":"0ae6e68a","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["i:0\n","RerankResult\n","\n","i:1\n","RerankResult\n","\n","i:2\n","RerankResult\n","\n","i:3\n","RerankResult\n","\n","i:4\n","RerankResult\n","\n"]}],"source":["for i, rerank_result in enumerate(reranked_text_zh):\n"," print(f\"i:{i}\")\n"," print(f\"{rerank_result}\")\n"," print()"]},{"cell_type":"markdown","id":"d539546c","metadata":{},"source":["我们发现,在重排得到的结果中,与罗伯特·沃德洛相关的那一条确实相关性最高,为 0.97。对于其他文章,它给出的相关性并不高。\n","- PS: 中文支持确实不太行"]},{"cell_type":"markdown","id":"554b9838","metadata":{},"source":["重排帮助我们确定在稠密检索出现的答案中哪一个才是正确的答案。现在,我们鼓励您在这里暂停,尝试自己的例子。用自己的查询,找到搜索结果,然后使用重排找到正确的答案。\n"]},{"cell_type":"markdown","id":"4479827b","metadata":{},"source":["### 4.2 搜索系统的评估"]},{"cell_type":"markdown","id":"ac052b00","metadata":{},"source":["\n","![](images/5-4.png)\n","\n","\n","既然我们有了所有这些搜索系统,您可能想知道如何评估它们。有多种[评估方法](https://zhuanlan.zhihu.com/p/351986117),如:\n","\n","- 平均精度均值(MAP)\n","- 平均倒数排名(MRR)\n","- 归一化折减累积增益(NDCG)\n"," 
\n","那么,如何创建一个测试集来评估这些模型呢?一个优质的测试集应该包含查询和正确的响应。然后,您可以将这些正确的响应与模型给出的响应进行比较,就像评估分类模型的准确性、精确度或召回率一样。如果您想了解更多关于评估搜索系统的信息,我们将会在资源中提供一些文章链接供您参考。\n","\n","现在,您已经学会使用搜索和重排来检索包含特定问题答案的文档。在下一课中,您将学习一些更有趣的内容。您将学习如何结合搜索系统和生成模型,以便以人类的方式输出查询的答案。"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.6"}},"nbformat":4,"nbformat_minor":5}