Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):
OpenSearch : 2.16.0.0
Describe the issue:
I am trying to create a Neural Search Pipeline so that I can perform Hybrid Search, using an Ollama LLM deployed on my machine. I performed the following steps:
Step 1. Register model group:
localhost:9200/_plugins/_ml/model_groups/_register
{
"name": "NLP_model_group",
"description": "A model group for NLP models",
"access_mode": "public"
}
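If the registration succeeds, the response should contain the generated group id, roughly of this shape (a sketch; the actual id will differ):
{
  "model_group_id": "<generated group id>",
  "status": "CREATED"
}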
Step 2. Register the Ollama URL as a trusted connector endpoint:
localhost:9200/_cluster/settings
{
"persistent": {
"plugins.ml_commons.trusted_connector_endpoints_regex": [
"^<Ollama url>/.*$"
]
}
}
Step 3. Enable connector access control:
localhost:9200/_cluster/settings
{
"persistent": {
"plugins.ml_commons.connector_access_control_enabled": true
}
}
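Since both settings target the same endpoint, they can also be applied in a single PUT (a combined sketch using the same placeholder):
localhost:9200/_cluster/settings
{
  "persistent": {
    "plugins.ml_commons.trusted_connector_endpoints_regex": [
      "^<Ollama url>/.*$"
    ],
    "plugins.ml_commons.connector_access_control_enabled": true
  }
}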
Step 4. Create the remote model connector:
localhost:9200/_plugins/_ml/connectors/_create
{
"name": "OpenAI Chat Connector",
"description": "The connector to a remote Ollama model",
"version": 1,
"protocol": "http",
"parameters": {
"endpoint": "<Ollama URL>",
"model": "mistral"
},
"actions": [
{
"action_type": "predict",
"method": "POST",
"url": "http://${parameters.endpoint}/api/embeddings",
"request_body": "{ \"model\": \"${parameters.model}\", \"prompt\": \"${parameters.text}\"}",
"pre_process_function": "\n StringBuilder builder = new StringBuilder();\n builder.append(\"\\\"\");\n String first = params.text_docs[0];\n builder.append(first);\n builder.append(\"\\\"\");\n def parameters = \"{\" +\"\\\"text\\\":\" + builder + \"}\";\n return \"{\" +\"\\\"parameters\\\":\" + parameters + \"}\";",
"post_process_function": "\n def name = \"embedding\";\n def dataType = \"FLOAT32\";\n if (params.embedding == null || params.embedding.length == 0) {\n return null;\n }\n def shape = [params.embedding.length];\n def json = \"{\" +\n \"\\\"name\\\":\\\"\" + name + \"\\\",\" +\n \"\\\"data_type\\\":\\\"\" + dataType + \"\\\",\" +\n \"\\\"shape\\\":\" + shape + \",\" +\n \"\\\"data\\\":\" + params.embedding +\n \"}\";\n return json;\n "
}
]
}
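Before registering the model, it can help to confirm the request/response shape the connector assumes by calling Ollama directly (a sketch; the /api/embeddings route with model and prompt is what the request_body above sends, and the "embedding" array it returns is what the post_process_function reads):
<Ollama URL>/api/embeddings
{
  "model": "mistral",
  "prompt": "test sentence"
}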
Step 5. Register the model to the model group:
localhost:9200/_plugins/_ml/models/_register
{
"name": "Ollama model",
"function_name": "remote",
"model_group_id": "<group id>",
"description": "test model",
"connector_id": "<connector id>"
}
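The register call returns a task_id rather than the model id itself; once the task completes, the model id used below can be read from the task status (sketch):
localhost:9200/_plugins/_ml/tasks/<task id>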
Step 6. Deploy the model:
localhost:9200/_plugins/_ml/models/<model id>/_deploy
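A quick smoke test before building the pipeline is a direct Predict call (a sketch; since the connector's request_body reads ${parameters.text}, passing text under parameters should exercise the full round trip to Ollama):
localhost:9200/_plugins/_ml/models/<model id>/_predict
{
  "parameters": {
    "text": "test sentence"
  }
}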
Step 7. Create NLP Pipeline:
localhost:9200/_ingest/pipeline/nlp-ingest-pipeline
{
"description": "An new NLP ingest pipeline",
"processors": [
{
"text_embedding": {
"model_id": "<model id>",
"field_map": {
"text": "embedding"
}
}
}
]
}
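The pipeline can be verified without indexing anything via the simulate API (sketch; the response should show an embedding field with 4096 floats next to the original text):
localhost:9200/_ingest/pipeline/nlp-ingest-pipeline/_simulate
{
  "docs": [
    {
      "_source": {
        "text": "hello world"
      }
    }
  ]
}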
Step 8. Create NLP Index:
localhost:9200/<index-name>
{
"settings": {
"index.knn": true,
"default_pipeline": "nlp-ingest-pipeline"
},
"mappings": {
"properties": {
"id": {
"type": "text"
},
"embedding": {
"type": "knn_vector",
"dimension": 4096,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"text": {
"type": "text"
}
}
}
}
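Note that "dimension": 4096 must match the embedding size the model actually returns (for mistral this should be 4096, agreeing with this mapping). The resulting mapping can be double-checked with:
localhost:9200/<index-name>/_mapping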
Step 9. Create the post-processor search pipeline:
localhost:9200/_search/pipeline/nlp-search-pipeline
{
"description": "Post processor for hybrid search",
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {
"technique": "min_max"
},
"combination": {
"technique": "arithmetic_mean",
"parameters": {
"weights": [
0.3,
0.7
]
}
}
}
}
]
}
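The weights apply positionally to the sub-queries of the hybrid clause in Step 11, so [0.3, 0.7] gives the lexical match query 30% and the neural query 70% of the combined score. If the neural side is what mis-ranks the documents, shifting weight toward the match query is one knob to try (a sketch with illustrative values):
localhost:9200/_search/pipeline/nlp-search-pipeline
{
  "description": "Post processor for hybrid search",
  "phase_results_processors": [
    {
      "normalization-processor": {
        "normalization": { "technique": "min_max" },
        "combination": {
          "technique": "arithmetic_mean",
          "parameters": { "weights": [0.6, 0.4] }
        }
      }
    }
  ]
}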
Step 10. Ingest Data:
localhost:9200/<index-name>/_doc/1
{
"text": "some text ...",
"id": "1"
}
localhost:9200/<index-name>/_doc/2
{
"text": "some text ...",
"id": "2"
}
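To confirm the ingest pipeline actually produced vectors, a stored document can be fetched back (the _source should now contain the embedding field alongside text and id):
localhost:9200/<index-name>/_doc/1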
Step 11. Hybrid Search query:
localhost:9200/<index-name>/_search?search_pipeline=nlp-search-pipeline
{
"_source": {
"exclude": [
"embedding"
]
},
"query": {
"hybrid": {
"queries": [
{
"match": {
"text": {
"query": "some text"
}
}
},
{
"neural": {
"embedding": {
"query_text": "some text",
"model_id": "<model_id>",
"k": 5
}
}
}
]
}
}
}
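For reference, running each sub-query on its own makes it easier to see which clause drives the ordering (a sketch of the neural half, same placeholders):
localhost:9200/<index-name>/_search
{
  "_source": {
    "excludes": [
      "embedding"
    ]
  },
  "query": {
    "neural": {
      "embedding": {
        "query_text": "some text",
        "model_id": "<model_id>",
        "k": 5
      }
    }
  }
}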
I was able to resolve all the issues and store both the text and its embedding for each document in the index. However, when I search I get 2 results, with the best match ranked second instead of first. How can I improve the results so that the best match comes back as the first record?