**Describe the bug**
When performing Knn search queries on an index multiple times, with documents being deleted and inserted, the search occasionally does not return any hits.
**To Reproduce**
1. Create a Knn index.
2. Generate a vector to be used during tests.
3. Add a document with the vector and refresh the index.
4. Search for that vector and retrieve the document ID.
5. Delete the document with the retrieved ID.
6. Repeat steps 3-5 until the search returns no hits.
**Expected behavior**
The search query should consistently return hits as long as there are documents in the index.
**Plugins**
Please list all plugins currently enabled.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Host/Environment (please complete the following information):**
- Operating System: M1 Mac (Also occurred on Linux/ARM64)
- OpenSearch Version: 2.7.0/2.9.0
**Additional context**
Python script that reproduces issue:
```python
import random
import requests
import json
from string import ascii_lowercase
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# OpenSearch cluster configuration
opensearch_host = 'localhost'
opensearch_port = 9200

# Random 10-letter lowercase index name, generated fresh on each run.
opensearch_index = ''.join(
    random.choice(ascii_lowercase) for _ in range(10)
)
opensearch_url = f'https://{opensearch_host}:{opensearch_port}'

# Replace 'admin' and 'admin' with your actual username and password
username = 'admin'
password = 'admin'

# A single 384-element vector of random floats, created once at start-up and
# shared by the rest of the script.
vec = [random.random() for _ in range(384)]
def add_document(index_name):
    """Index one document carrying the shared test vector and return its ID.

    The document consists of a single ``__chunks`` object holding a random
    field name, random content, and the module-level ``vec`` as its vector
    field. The index is refreshed before the ID is returned.

    Args:
        index_name: Name of the index to write to.

    Returns:
        The ``_id`` value from the indexing response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    document = {
        '__chunks': {
            '__field_name': f'field_{random.randint(1, 100)}',
            '__field_content': f'content_{random.randint(1, 100)}',
            '__vector_marqo_knn_field': vec
        }
    }
    url = f"{opensearch_url}/{index_name}/_doc"
    headers = {'Content-Type': 'application/json'}
    response = requests.post(
        url,
        json=document,
        headers=headers,
        auth=(username, password),
        verify=False,
    )
    # Fail loudly on an error response; otherwise the missing '_id' key below
    # would surface as an opaque KeyError that hides the real server error.
    response.raise_for_status()
    response_data = response.json()
    refresh_index(index_name)
    return response_data['_id']
def refresh_index(index_name):
    """POST to the index's ``_refresh`` endpoint and report the outcome.

    Prints a success message when the response status is 200 and a failure
    message for any other status. Returns nothing.
    """
    reply = requests.post(
        f"{opensearch_url}/{index_name}/_refresh",
        auth=(username, password),
        verify=False,
    )
    if reply.status_code != 200:
        print(f"Failed to refresh index '{index_name}'.")
        return
    print(f"Index '{index_name}' refreshed.")
def search_documents(index_name, knn_query):
    """Run ``knn_query`` as a nested query on ``__chunks`` and return hit IDs.

    Args:
        index_name: Name of the index to search.
        knn_query: Query clause placed inside the ``nested`` query on the
            ``__chunks`` path.

    Returns:
        A list with the ``_id`` of every hit in the response (at most 100,
        the requested ``size``). An empty list means the search succeeded
        but matched nothing.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = f"{opensearch_url}/{index_name}/_search"
    headers = {'Content-Type': 'application/json'}
    search_body = {
        "size": 100,
        "from": 0,
        "_source": {  # Exclude the vector field from the returned source
            "exclude": ["__chunks.__vector_marqo_knn_field"]
        },
        "query": {
            "nested": {
                "path": "__chunks",
                "inner_hits": {
                    "_source": {
                        "include": ["__chunks.__field_content", "__chunks.__field_name"]
                    }
                },
                "query": knn_query
            }
        }
    }
    response = requests.post(
        url,
        json=search_body,
        headers=headers,
        auth=(username, password),
        verify=False,
    )
    # An error body has no 'hits' key. Raise here so that a failed search is
    # reported as an HTTP error rather than a KeyError, and is never confused
    # with a successful search that simply returned zero hits.
    response.raise_for_status()
    response_data = response.json()
    # Extract document IDs from the search results
    return [hit['_id'] for hit in response_data['hits']['hits']]
def delete_documents_by_ids(index_name, doc_ids):
    """Delete each document in ``doc_ids`` one by one, then refresh the index.

    Prints one line per document saying whether the DELETE returned 200.

    Args:
        index_name: Name of the index holding the documents.
        doc_ids: Iterable of document IDs to delete.
    """
    json_headers = {'Content-Type': 'application/json'}
    credentials = (username, password)
    for doc_id in doc_ids:
        reply = requests.delete(
            f"{opensearch_url}/{index_name}/_doc/{doc_id}",
            headers=json_headers,
            auth=credentials,
            verify=False,
        )
        if reply.status_code != 200:
            print(f"Failed to delete document with ID: {doc_id}")
            continue
        print(f"Deleted document with ID: {doc_id}")
    # NOTE(review): the pasted source lost its indentation; the refresh is
    # taken to run once after all deletions rather than per document - confirm.
    refresh_index(index_name)
def _hnsw_method():
    """Return a fresh dict describing the HNSW method used by this index.

    The same definition appears twice in the index body (once under ``_meta``
    and once on the vector field); a new dict is built on each call so the
    two places never share a mutable object.
    """
    return {
        "name": "hnsw",
        "space_type": "cosinesimil",
        "engine": "lucene",
        "parameters": {
            "ef_construction": 128,
            "m": 16,
        },
    }


# Request body used to create the index: settings plus mappings.
index_mapping = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
            "refresh_interval": "1s",
            "store.hybrid.mmap.extensions": [
                "nvd", "dvd", "tim", "tip", "dim", "kdd", "kdi", "cfs", "doc", "vec", "vex",
            ],
        },
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        # Free-form metadata stored alongside the mapping.
        "_meta": {
            "media_type": "text",
            "index_settings": {
                "index_defaults": {
                    "treat_urls_and_pointers_as_images": False,
                    "model": "hf/all_datasets_v4_MiniLM-L6",
                    "normalize_embeddings": True,
                    "text_preprocessing": {
                        "split_length": 2,
                        "split_overlap": 0,
                        "split_method": "sentence",
                    },
                    "image_preprocessing": {
                        "patch_method": None,
                    },
                    "ann_parameters": _hnsw_method(),
                },
                "number_of_shards": 1,
                "number_of_replicas": 0,
            },
            "model": "hf/all_datasets_v4_MiniLM-L6",
        },
        # Map any dynamically added string field as full text.
        "dynamic_templates": [
            {
                "strings": {
                    "match_mapping_type": "string",
                    "mapping": {"type": "text"},
                }
            }
        ],
        "properties": {
            "__chunks": {
                "type": "nested",
                "properties": {
                    "__field_name": {"type": "keyword"},
                    "__field_content": {"type": "text"},
                    # 384 matches the length of the test vector.
                    "__vector_marqo_knn_field": {
                        "type": "knn_vector",
                        "dimension": 384,
                        "method": _hnsw_method(),
                    },
                },
            }
        },
    },
}
# Create the index from index_mapping and stop the script if that fails.
# Carrying on without the index would make every later step run against a
# missing index (or, presumably, one auto-created with default mappings -
# confirm against the cluster's settings), which would invalidate the repro.
url = f"{opensearch_url}/{opensearch_index}"
headers = {'Content-Type': 'application/json'}
try:
    response = requests.put(
        url,
        json=index_mapping,
        headers=headers,
        auth=(username, password),
        verify=False,
    )
except requests.RequestException as e:
    # Only transport-level failures are expected here (connection refused,
    # TLS errors, ...); anything else is a programming error and should
    # propagate with its traceback.
    raise SystemExit(f"An error occurred: {e}")

if response.status_code == 200:
    print(f"Created index: {opensearch_index}")
else:
    raise SystemExit(f"Failed to create index: {opensearch_index}")
# Main loop: add one document, search with the shared vector, delete every
# document the search returned, and repeat. The loop ends early at the first
# search that returns no IDs.
iterations = 1000000  # Set the number of iterations

# The query does not change between iterations, so it is built once up front.
knn_query = {
    "knn": {
        "__chunks.__vector_marqo_knn_field": {
            "vector": vec,
            "k": 100,
        }
    }
}

for i in range(iterations):
    print(f"Iteration {i + 1}:")

    # Add a random document
    doc_id = add_document(opensearch_index)
    print(f"Added document with ID: {doc_id}")

    # Perform a KNN search
    search_results = search_documents(opensearch_index, knn_query)
    print(f"Found {len(search_results)} documents matching the KNN query")

    if not search_results:
        print("No documents found")
        break  # Exit the loop if no documents are found

    # Delete the found documents
    print(search_results)
    delete_documents_by_ids(opensearch_index, search_results)
# Ask whether to remove the test index; only "y" (any letter case) deletes it.
to_delete_index = input("Delete the index? (y/n): ")
# An empty answer is not "y", so no separate emptiness check is needed.
if to_delete_index.lower() == "y":
    url = f"{opensearch_url}/{opensearch_index}"
    headers = {'Content-Type': 'application/json'}
    response = requests.delete(
        url,
        headers=headers,
        auth=(username, password),
        verify=False,
    )
    outcome = (
        f"Deleted index: {opensearch_index}"
        if response.status_code == 200
        else f"Failed to delete index: {opensearch_index}"
    )
    print(outcome)
print("Script completed.")
```