Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):
OpenSearch 2.11
Describe the issue:
I’m trying to load a Parquet file using OpenSearch Ingestion Pipelines, but I’m running into an issue with a knn_vector field. My data in the Parquet file is stored as a list of floats, but when the pipeline tries to ingest the document it triggers this error:
{"pluginId":"opensearch","pluginName":"opensearch","pipelineName":"log-pipeline","failedData":
{"index":"testing_os_pipeline_index","indexId":null,"status":400,
"message":"failed to parse field [node_name_vector] of type [knn_vector] in document with id '_puDDY0BaCZKHEc2_Crj'.
Preview of field's value: '{element=0.009439812041819096}' caused by Current token (START_OBJECT) not numeric, can not use numeric value accessors
\n at [Source: (byte[])\"{\"node_id\":\"lvc-p-3588f59718913a00cab2ff53bc87e2b0\",\"node_name\":\" \",
\"node_name_vector\":[{\"element\":0.009439812041819096},{\"element\":-0.00675856601446867},{\"element\":0.03931649401783943},{\"element\":-0.02302893064916134},{\"element\":0.0018562552286311984},{\"element\":0.05322980135679245},{\"element\":0.06775651127099991},{\"element\":0.049850285053253174},{\"element\":-0.01152312383055687},
{\"element\":-0.03115047886967659},{\"element\":0.005154617130756378},{\"element\":-0.07197585701942444},{\"element\":0.01560\"[truncated 12388 bytes]; line: 1,
column: 90]","document":{"node_id":"lvc-p-3588f59718913a00cab2ff53bc87e2b0","node_name":" ","node_name_vector":[{"element":0.009439812041819096},{"element":-0.00675856601446867},{"element":0.03931649401783943},{"element":-0.02302893064916134},{"element":0.0018562552286311984},{"element":0.05322980135679245},
{"element":0.06775651127099991},{"element":0.049850285053253174},{"element":-0.01152312383055687},{"element":-0.03115047886967659},{"element":0.005154617130756378},{"element":-0.07197585701942444},{"element":0.015602701343595982},{"element":0.041258830577135086},{"element":0.010276184417307377},{"element":-0.01007060706615448},
{"element":-0.017029765993356705},{"element":0.042910680174827576},{"element":-0.03600999340415001},{"element":0.010554338805377483},{"element":0.029608823359012604},{"element":-0.022530287504196167},{"element":-0.021999403834342957},{"element":-0.03891109302639961},{"element":0.03567637503147125} [...]],"s3":
{"bucket":"xxxx","key":"test_data/list_vector.parquet"},"@timestamp":"2024-01-
15T14:25:02.408Z"}},"timestamp":"2024-01-15T14:26:10.475Z"}
Configuration:
Here’s my pipeline configuration:
version: "2"
log-pipeline:
source:
s3:
codec:
parquet:
schema: |
{
"namespace": "org.example.test",
"type": "record",
"name": "TestMessage",
"fields": [
{"name": "node_id", "type": "string"},
{"name": "node_name", "type": "string"}
{"name": "node_name_vector", "type": {"type": "array", "items": "float"},
]
}
compression: "none"
aws:
region: "us-east-2"
sts_role_arn: "xxxx"
acknowledgments: true
scan:
scheduling:
interval: PT30S
buckets:
- bucket:
name: xxxx
filter:
include_prefix:
- test_data/
delete_s3_objects_on_read: false
processor:
- date:
destination: "@timestamp"
from_time_received: true
sink:
- opensearch:
hosts: ["xxxx"]
index: "testing_os_pipeline_index"
aws:
sts_role_arn: "xxxx"
dlq:
s3:
bucket: "xxxx"
region: "us-east-2"
key_path_prefix: "log-pipeline/dlq"
sts_role_arn: "xxxx"
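The error confirms the target index maps node_name_vector as a knn_vector. A minimal sketch of that kind of mapping, for reference (the dimension value here is illustrative, not my real one):

```json
PUT /testing_os_pipeline_index
{
  "settings": {
    "index": { "knn": true }
  },
  "mappings": {
    "properties": {
      "node_id": { "type": "keyword" },
      "node_name": { "type": "text" },
      "node_name_vector": { "type": "knn_vector", "dimension": 384 }
    }
  }
}
```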
Relevant Logs or Screenshots:
Here’s what a row of the vector data looks like (first few values, matching the document in the DLQ message above):
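```
node_id:           lvc-p-3588f59718913a00cab2ff53bc87e2b0
node_name:         " "
node_name_vector:  [0.009439812041819096, -0.00675856601446867, 0.03931649401783943, -0.02302893064916134, ...]
```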