Hi,
I have a document indexed as follows on Amazon OpenSearch:
{
"title": "TST-TestAddCourseGLOBAL",
"titlesort": "TST-TESTADDCOURSEGLOBAL",
"description": "€£$@§#",
"code": "TST-TEST"
}
I have the following query:
{
"query": {
"bool": {
"must": [
{ "match": { "_index": "138d7b27-4429-4ba7-bd7a-7a0a23d5f03a" }},
{ "match": { "lms_document_type": "EsAgentCourse"}},
{ "match": { "description": "€£$@§#" }}
]
}
}
}
I cannot find any results when using those symbols as the search keyword. However, if I change the course’s description to “解毒熟語”, I can always find the document, even if I search for just one kanji character. The index was created with the following mapping through the PHP client (“elasticsearch/elasticsearch”: “7.7.0”):
/**
 * Builds the mapping definition for a `text` field and its sub-fields.
 *
 * Two sub-fields are always present: `length` (a `token_count` using the
 * `standard` analyzer) and `autocomplete` (text analysed with the
 * `autocomplete` analyzer). Two more are opt-in through the flags below.
 *
 * NOTE(review): the top-level field sets no analyzer, so it presumably uses
 * the index default (the `standard` analyzer unless one is configured). That
 * analyzer is expected to emit no tokens for symbol-only text - confirm
 * whether symbol search needs a dedicated sub-field.
 *
 * @param bool $keywordAutocomplete When truthy, adds an `autocomplete_keyword`
 *                                  sub-field using the analyzer of that name.
 * @param bool $sortable            When truthy, adds a `sort` sub-field using
 *                                  `keyword_lowercase` with fielddata enabled.
 *
 * @return array Mapping with the keys `type` and `fields`.
 */
private static function getTextFieldMapping($keywordAutocomplete = false, $sortable = false) {
    // Sub-fields shared by every text field.
    $subFields = [
        'length' => ['type' => 'token_count', 'analyzer' => 'standard'],
        'autocomplete' => ['type' => 'text', 'analyzer' => 'autocomplete'],
    ];

    // Array union appends the new key after the existing ones.
    if ($keywordAutocomplete) {
        $subFields += [
            'autocomplete_keyword' => ['type' => 'text', 'analyzer' => 'autocomplete_keyword'],
        ];
    }

    if ($sortable) {
        $subFields += [
            'sort' => ['type' => 'text', 'analyzer' => 'keyword_lowercase', 'fielddata' => true],
        ];
    }

    return ['type' => 'text', 'fields' => $subFields];
}
/**
 * Assembles the index definition (settings, analysis chain and field
 * mappings) for the given index.
 *
 * NOTE(review): the returned shape (`index` + `body`) looks like the
 * parameter array for the PHP client's index-creation call - confirm
 * against the caller.
 *
 * @param string $index_name Name placed under the `index` key.
 * @param int    $shards     Value for `number_of_shards`.
 * @param int    $replica    Value for `number_of_replicas`.
 *
 * @return array Array with the keys `index` and `body`; `body` holds
 *               `settings` and `mappings`.
 */
public static function getMapping($index_name, $shards, $replica) {
    // --- Analysis: token filters -------------------------------------
    $tokenFilters = [
        'autocomplete_filter' => [
            'type' => 'edge_ngram',
            'min_gram' => 2,
            'max_gram' => 40,
        ],
        'autocomplete_filter_min_3' => [
            'type' => 'edge_ngram',
            'min_gram' => 3,
            'max_gram' => 40,
        ],
    ];

    // --- Analysis: character filters ---------------------------------
    // The \uXXXX sequences are deliberately left for Elasticsearch to
    // interpret: PHP only expands the braced form \u{...}.
    $charFilters = [
        'char_substitutions' => [
            'type' => 'mapping',
            'mappings' => [
                "\u0091=>\u0020",
                "\u0092=>\u0020",
                "\u2018=>\u0020",
                "\u2019=>\u0020",
                "\u201B=>\u0020",
                "'=>\u0020",
                // Replace '-' so that it does not split tokens,
                // as per https://docebo.atlassian.net/browse/DOC-13620
                '-=>_',
            ],
        ],
    ];

    // --- Analysis: analyzers -----------------------------------------
    // Character filters shared by the two standard-tokenizer analyzers.
    $sharedCharFilters = ['char_substitutions', 'html_strip'];

    $analyzers = [
        'autocomplete' => [
            'type' => 'custom',
            'tokenizer' => 'standard',
            'filter' => ['lowercase', 'autocomplete_filter'],
            'char_filter' => $sharedCharFilters,
        ],
        'autocomplete_min_3' => [
            'type' => 'custom',
            'tokenizer' => 'standard',
            'filter' => ['lowercase', 'autocomplete_filter_min_3'],
            'char_filter' => $sharedCharFilters,
        ],
        // Needed by the 'tags' field
        'autocomplete_keyword' => [
            'type' => 'custom',
            'tokenizer' => 'keyword',
            'filter' => ['lowercase', 'autocomplete_filter'],
        ],
        // Backs the 'sort' sub-field of sortable text fields
        'keyword_lowercase' => [
            'type' => 'custom',
            'tokenizer' => 'keyword',
            'filter' => ['lowercase'],
        ],
    ];

    $settings = [
        'number_of_shards' => $shards,
        'number_of_replicas' => $replica,
        'index' => [
            'max_terms_count' => 120000,
            'highlight' => [
                'max_analyzed_offset' => 2000000,
            ],
        ],
        'analysis' => [
            'filter' => $tokenFilters,
            'char_filter' => $charFilters,
            'analyzer' => $analyzers,
        ],
    ];

    // --- Field mappings ----------------------------------------------
    // Reusable single-type definitions (PHP arrays are copied by value,
    // so sharing them between keys is safe).
    $keyword = ['type' => 'keyword'];
    $integer = ['type' => 'integer'];
    $long = ['type' => 'long'];
    $double = ['type' => 'double'];
    $text = ['type' => 'text'];

    // Analysed text variants produced by the sibling helper.
    $analysedText = self::getTextFieldMapping();
    $analysedTextWithKeyword = self::getTextFieldMapping(true, false);
    $analysedTextSortable = self::getTextFieldMapping(false, true);

    $properties = [
        # Generic fields (shared by more than one document type)
        'lms_document_type' => $keyword, # course, asset, ...
        'lms_doc_id' => $keyword,
        'lms_id' => $integer,
        'lms_domain' => $keyword,
        'title' => $analysedTextSortable,
        'description' => $analysedText,
        'short_description' => $analysedText,
        'additional_fields' => $analysedText,
        'transcript' => $analysedText,
        'tags' => $analysedTextWithKeyword,
        'skillNames' => $analysedTextWithKeyword,
        # TODO remove titlesort and use the sort sub-field on the title once ES 2 compatibility is dropped
        'titlesort' => $keyword,
        'created' => $integer,
        'createdString' => $text,
        'channels' => $analysedText,
        'channelIds' => $keyword,
        'duration' => $integer,
        'rating' => $double,
        'show_rules' => $text,
        'status' => $long,
        'isRelatedToDisabledChannels' => $long,

        # EsAgentKnowledgeAsset fields
        'content_type' => $integer,
        'visibleToAll' => $long,
        'is_private' => $keyword,
        'conversion_status' => $keyword,
        'id_creator' => $keyword,
        'datePublished' => $text,
        'views' => $long,

        # EsAgentCourse fields
        'course_type' => $keyword,
        'uidCourse' => $analysedText,
        'language' => $keyword,
        'lang_code' => $keyword,
        'is_paid' => $long,
        'price' => $double, # also used in EsAgentPlan
        // The two timestamps below encode the course's 'can_subscribe' attribute:
        //   0 - 0             : can_subscribe=0 [closed]
        //   0 - 9999999999999 : can_subscribe=1 [open]
        //   X - Y             : can_subscribe=2 && start=X, end=Y
        // At search time the current timestamp is compared with both values;
        // if it lies between them, the 'can_subscribe' setting is resolved.
        'sub_start_timestamp' => $long, # also used in EsAgentLo
        'sub_end_timestamp' => $long, # also used in EsAgentLo

        # EsAgentLo fields
        'object_type' => $integer,
        'visible' => $keyword,
        'can_subscribe' => $keyword,
        'idCourse' => $keyword,

        # EsAgentQandA fields
        'question_type' => $integer,
        'answerIds' => $keyword,
        'answers' => $text,
        'idUser' => $keyword,
        'open' => $keyword,
        'idContent' => $keyword,
        'idLearningObject' => $keyword,
        'question_request' => $text,
        'is_related' => $long,
        'has_best_answer' => $long,
        'meta_field_question_type' => $keyword,

        # EsAgentPlan fields
        'path_code' => $keyword,
        'visible_in_catalog' => $keyword,
    ];

    return [
        'index' => $index_name,
        'body' => [
            'settings' => $settings,
            'mappings' => [
                'properties' => $properties,
            ],
        ],
    ];
}
I cannot spot what the issue could be, since the filters shouldn’t apply to symbols. Maybe, if the text consists only of symbols, nothing is recognized as a gram? What configuration would allow a search to find these kinds of documents?