Versions (relevant - OpenSearch/Dashboard/Server OS/Browser): 2.18.0/2.18.0/Ubuntu 22.04
Describe the issue:
I went through the documentation for hybrid search and set up a search pipeline at the cluster level. I’m trying to get hybrid search to work but am facing some difficulty.
The problem is that my original OpenSearch query used a Boolean clause for filtering. With hybrid search, I can’t seem to keep using it, because the Boolean clause doesn’t allow any other fields alongside it, and neither does the hybrid clause.
This is my mapping:
{
"hybrid-search-test-index": {
"aliases": {},
"mappings": {
"dynamic": "strict",
"properties": {
"channel_id": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"lang": {
"type": "keyword"
},
"metadata": {
"dynamic": "true",
"properties": {
"article_id": {
"type": "keyword"
},
"faq_id": {
"type": "keyword"
},
"generation_text": {
"type": "keyword"
},
"image_chunk_ids": {
"type": "keyword"
},
"image_hash": {
"type": "keyword"
},
"image_url": {
"type": "keyword"
},
"revision_id": {
"type": "keyword"
},
"screenshot": {
"type": "boolean"
},
"space_id": {
"type": "keyword"
},
"text_chunk_ids": {
"type": "keyword"
},
"text_hash": {
"type": "keyword"
}
}
},
"texts": {
"properties": {
"en": {
"type": "text",
"analyzer": "en_analyzer"
},
"etc": {
"type": "text",
"analyzer": "en_analyzer"
},
"ja": {
"type": "text",
"analyzer": "ja_analyzer"
},
"ko": {
"type": "text",
"analyzer": "ko_analyzer"
},
"subtype": {
"type": "keyword"
}
}
},
"type": {
"type": "keyword"
},
"vectors": {
"type": "nested",
"properties": {
"model_name": {
"type": "keyword"
},
"model_version": {
"type": "keyword"
},
"subtype": {
"type": "keyword"
},
"values": {
"type": "knn_vector",
"dimension": 1024,
"method": {
"engine": "lucene",
"space_type": "cosinesimil",
"name": "hnsw",
"parameters": {}
}
}
}
}
}
},
"settings": {
"index": {
"replication": {
"type": "DOCUMENT"
},
"blocks": {
"write": "true"
},
"provided_name": "hybrid-search-test-index",
"knn": "true",
"creation_date": "1742202362678",
"analysis": {
"analyzer": {
"ja_analyzer": {
"filter": [
"kuromoji_baseform",
"kuromoji_part_of_speech",
"kuromoji_readingform",
"kuromoji_stemmer",
"lowercase"
],
"type": "custom",
"tokenizer": "kuromoji_tokenizer"
},
"ko_analyzer": {
"filter": [
"nori_readingform",
"lowercase",
"nori_part_of_speech"
],
"type": "custom",
"tokenizer": "nori_tokenizer"
},
"en_analyzer": {
"type": "standard"
}
}
},
"number_of_replicas": "1",
"uuid": "ypIc3md0T8esjj6clYUnQw",
"version": {
"created": "136397827",
"upgraded": "136397827"
},
"routing": {
"allocation": {
"initial_recovery": {
"_id": null
}
}
},
"number_of_shards": "1",
"routing_partition_size": "1",
"resize": {
"source": {
"name": "table-rag-test-bullsone",
"uuid": "D_3bxlrIQfOsTACAXmrzzg"
}
}
}
}
}
}
I don’t think my index is particularly complicated. Here’s an example of the query I’m using:
POST /hybrid-search-test-index/_search?search_pipeline=nlp-search-pipeline
{
"size": 10,
"_source": {
"exclude": ["vectors.values"]
},
"query": {
"bool": {
"filter": {
"nested": {
"path": "vectors",
"query": {
"bool": {
"filter": [
{
"term": {
"vectors.subtype": "cell"
}
},
{
"term": {
"vectors.model_name": "embedding"
}
},
{
"term": {
"vectors.model_version": "240718"
}
}
]
}
}
}
}
},
"hybrid": {
"queries": [
{
"match": {
"texts.ko": {
"query": "체어",
"boost": 1.0
}
}
},
{
"nested": {
"path": "vectors",
"query": {
"knn": {
"vectors.values": {
"vector": [
1
],
"k": 10,
"boost": 1.0
}
}
}
}
}
]
}
}
}
I need to be able to filter because we use a single index for multiple clients, and we don’t want a search for one client to return another client’s documents (the channel_id in the metadata refers to each client ID).
Is it possible to use hybrid search with filters?