|
1 | 1 | { |
2 | 2 | "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Lost In Translation? Multilingual Embedding Models Are All You Need*\n", |
| 8 | + "\n", |
| 9 | + "This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both monolingual and multilingual datasets. It accompanies the article *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)." |
| 10 | + ] |
| 11 | + }, |
3 | 12 | { |
4 | 13 | "cell_type": "code", |
5 | 14 | "execution_count": null, |
|
40 | 49 | " print(f\"Data successfully downloaded and saved to {output_file}\")\n", |
41 | 50 | "else:\n", |
42 | 51 | " print(f\"Failed to download data: {response.status_code}\")\n", |
43 | | - " print(response.text)\n" |
| 52 | + " print(response.text)" |
44 | 53 | ] |
45 | 54 | }, |
46 | 55 | { |
|
54 | 63 | "# Prompt for Elastic Cloud credentials (cloud_id and api_key)\n", |
55 | 64 | "print(\"Enter your Elasticsearch credentials:\")\n", |
56 | 65 | "cloud_id = input(\"Enter your cloud_id: \")\n", |
57 | | - "api_key = getpass(\"Enter your api_key: \")\n" |
| 66 | + "api_key = getpass(\"Enter your api_key: \")" |
58 | 67 | ] |
59 | 68 | }, |
60 | 69 | { |
|
82 | 91 | ], |
83 | 92 | "source": [ |
84 | 93 | "from elasticsearch import Elasticsearch\n", |
| 94 | + "\n", |
85 | 95 | "try:\n", |
86 | 96 | " es = Elasticsearch(\n", |
87 | 97 | " hosts=[{\"host\": \"localhost\", \"port\": 9200, \"scheme\": \"https\"}],\n", |
88 | 98 | " basic_auth=(\"elastic\", \"qaf_admin\"),\n", |
89 | 99 | " verify_certs=False, # Set to True if you have valid SSL certificates\n", |
90 | 100 | " # Alternatively, you can use Elastic cloud_id and api_key\n", |
91 | | - " #api_key=getpass(\"API Key: \")\n", |
92 | | - " #cloud_id=getpass(\"Cloud ID: \"),\n", |
| 101 | + " # api_key=getpass(\"API Key: \")\n", |
| 102 | + " # cloud_id=getpass(\"Cloud ID: \"),\n", |
93 | 103 | " )\n", |
94 | 104 | "\n", |
95 | 105 | " # Test the connection\n", |
|
101 | 111 | "except Exception as e:\n", |
102 | 112 | " print(f\"Error connecting to Elasticsearch: {e}\")\n", |
103 | 113 | " print(\"Please check your credentials\")\n", |
104 | | - " raise\n" |
| 114 | + " raise" |
105 | 115 | ] |
106 | 116 | }, |
107 | 117 | { |
|
147 | 157 | " es.indices.create(index=index_name, body=mapping)\n", |
148 | 158 | "\n", |
149 | 159 | "# Load the JSON data\n", |
150 | | - "with open('./multilingual_coco_sample.json', 'r') as f:\n", |
| 160 | + "with open(\"./multilingual_coco_sample.json\", \"r\") as f:\n", |
151 | 161 | " data = json.load(f)\n", |
152 | 162 | "\n", |
153 | 163 | "rows = data[\"rows\"]\n", |
|
175 | 185 | " if description == \"\":\n", |
176 | 186 | " continue\n", |
177 | 187 | " # Add index operation\n", |
| 188 | + " bulk_data.append({\"index\": {\"_index\": index_name}})\n", |
| 189 | + " # Add document\n", |
178 | 190 | " bulk_data.append(\n", |
179 | | - " {\"index\": {\"_index\": index_name}}\n", |
| 191 | + " {\n", |
| 192 | + " \"language\": lang,\n", |
| 193 | + " \"description\": description,\n", |
| 194 | + " \"en\": first_eng_caption,\n", |
| 195 | + " \"image_url\": image_url,\n", |
| 196 | + " }\n", |
180 | 197 | " )\n", |
181 | | - " # Add document\n", |
182 | | - " bulk_data.append({\n", |
183 | | - " \"language\": lang,\n", |
184 | | - " \"description\": description,\n", |
185 | | - " \"en\": first_eng_caption,\n", |
186 | | - " \"image_url\": image_url,\n", |
187 | | - " })\n", |
188 | 198 | "\n", |
189 | 199 | "# Perform bulk indexing\n", |
190 | 200 | "if bulk_data:\n", |
|
0 commit comments