Exact search getting less precedence than phonetic search? - elasticsearch-5

I have an elasticsearch index and am using the following query:
"_source": [
"title",
"content"
],
"size": 15,
"from": 0,
"query": {
"bool": {
"must": {
"multi_match": {
"query": "{{query}}",
"fields": [
"title",
"content"
],
"operator": "or"
}
},
"should": [
{
"multi_match": {
"query": "{{query}}",
"fields": [
"title.standard^16",
"content.standard^2"
],
"operator": "and"
}
},
{
"match_phrase": {
"content.standard": {
"query": "{{query}}",
"_name": "Phrase on title",
"boost": 1000
}
}
}
]
}
},
"highlight": {
"fields": {
"content": {}
},
"fragment_size": 100
}
}
Here is the mapping I set:
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_metaphone"
]
}
},
"filter": {
"my_metaphone": {
"type": "phonetic",
"encoder": "metaphone",
"replace": true
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "my_analyzer",
"fields": {
"standard": {
"type": "text"
},
"stemmer": {
"type": "text",
"analyzer": "english"
}
}
},
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "my_analyzer",
"fields": {
"standard": {
"type": "text"
},
"stemmer": {
"type": "text",
"analyzer": "english"
}
}
}
}
}
}
Here is my logic with the query:
1) It will give the highest precedence to a phrase if it appears.
2) If not it will use the standard analyzer (that is the text, as is) and give it the highest precedence.
3) If all else doesn't match up, it will use the phonetic analyzer to get the results, that is the least precedence.
But obviously there is some fault to this as it seems to give higher precedence to the phonetic analyzer than the standard or phrase. For example, if I search for "Person of Indian Origin" it returns results on the top highlighting "Pursuant" "pursuing" and very, very less number of results with person of Indian origin although I know a large number of them exists. How do I solve this?

Related

Problem with Elasticsearch querying nested documents

I am learning ES and I am having problems with this query:
Given 2 products:
products/_source/1
{
"product_id": "58410-2",
"name": [
{
"locale": "en",
"translation": "CBC panel"
},
{
"locale": "vn",
"translation": "CBC panel VN"
}
],
"status": "active",
"category": {
"id": 8,
"name": [
{
"locale": "en",
"translation": "Hematology"
},
{
"locale": "vn",
"translation": "huyết học"
}
]
},
"children": [
{
"product_id": "6690-2",
"name": [
{
"locale": "en",
"translation": "Leukocytes"
},
{
"locale": "vn",
"translation": "Leukocytes vn"
}
],
"status": "active",
"category": {
"id": 8,
"name": [
{
"locale": "en",
"translation": "Hematology"
},
{
"locale": "vn",
"translation": "huyết học"
}
]
},
"children": []
}]}
and
products/_source/2
{
"product_id": "6690-2",
"name": [
{
"locale": "en",
"translation": "Leukocytes"
},
{
"locale": "vn",
"translation": "Leukocytes vn"
}
],
"status": "active",
"category": {
"id": 8,
"name": [
{
"locale": "en",
"translation": "Hematology"
},
{
"locale": "vn",
"translation": "huyết học"
}
]
},
"children": []
}
where a product is a single document but also can be nested in a children array of other products. Both products are different documents in the index.
and this index:
{
"products": {
"aliases": {},
"mappings": {
"dynamic": "false",
"properties": {
"category": {
"properties": {
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
}
}
},
"children": {
"type": "nested"
},
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
},
"product_id": {
"type": "keyword"
},
"status": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"routing": {
"allocation": {
"include": {
"_tier_preference": "data_content"
}
}
},
"number_of_shards": "3",
"provided_name": "products",
"number_of_replicas": "1"
}
}
}
}
I want to be able to query for "Leuko" (or the category or the product_id) and retrieve both products, the single product and the root product.
I have tried using object field, nested, flattened but I think the problem is I don't know how to properly write the query, I have tried things like this (I am using a ruby library but I think it is easy to follow):
#query = {
query: {
query_string: {
fields: ['name.translation', 'children.name.translation', 'category.name.translation', 'children.product_id'],
query: "*#{text}*"
}
},
size: 50
}
#query = {
query: {
nested: {
path: 'children',
query: {
bool: {
should: [
term: { 'children.name.translation' => "*#{text}*" },
term: { 'name.translation' => "*#{text}*" }
]
}
}
}
}
}
but I think at some point I dunno what I am doing anymore and I am just randomly trying different stuff from the documentation.
Follow my query suggestion. Note that I had to add the fields in the Nested object to the mapping.
Mapping:
{
"mappings": {
"dynamic": "false",
"properties": {
"category": {
"properties": {
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
}
}
},
"children": {
"type": "nested",
"properties": {
"product_id": {
"type": "keyword"
},
"category": {
"properties": {
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
}
}
},
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
},
"status": {
"type": "keyword"
}
}
},
"name": {
"properties": {
"locale": {
"type": "keyword"
},
"translation": {
"type": "text"
}
}
},
"product_id": {
"type": "keyword"
},
"status": {
"type": "keyword"
}
}
}
}
Query:
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"nested": {
"path": "children",
"query": {
"wildcard": {
"children.name.translation": "leuko*"
}
}
}
},
{
"wildcard": {
"name.translation": "leuko*"
}
}
]
}
}
}
hint
See that you use translation. Avoid using array to make your queries simpler.
What I would do in your case is to create a field for each language, this makes the use of analyzer more flexible for each type of language and you stop using an array and work with an object.
PUT test
{
"mappings": {
"properties": {
"name":{
"type": "text",
"fields": {
"es":{
"type": "text",
"analyzer":"english"
},
"vn":{
"type": "text"
}
}
}
}
}
}
POST test/_doc/
{
"name": "Leukocytes"
}
An example query using field languages.
GET test/_search
{
"query": {
"multi_match": {
"query": "Leukocytes",
"fields": ["name.es", "name.vn"]
}
}
}

Elastic search - aggregation filter for product options

I have a products catalogue where every product is indexed as follows (queried from http://localhost:9200/products/_doc/1) as sample:
{
"_index": "products_20201202145032789",
"_type": "_doc",
"_id": "1",
"_version": 1,
"_seq_no": 0,
"_primary_term": 1,
"found": true,
"_source": {
"title": "Roncato Eglo",
"description": "Amazing LED light made of wood and description continues.",
"price": 3990,
"manufacturer": "Eglo",
"category": [
"Lights",
"Indoor lights"
],
"options": [
{
"title": "Mount type",
"value": "E27"
},
{
"title": "Number of bulps",
"value": "4"
},
{
"title": "Batteries included",
"value": "true"
},
{
"title": "Ligt temperature",
"value": "warm"
},
{
"title": "Material",
"value": "wood"
},
{
"title": "Voltage",
"value": "230"
}
]
}
}
Every option contains different value, so there are many Mount type values, Light temperature values, Material values, and so on.
How can I create an aggregation (filter) where I can let customers choose between various Mount Type options:
[ ] E27
[X] E14
[X] GU10
...
Or let them choose from different Material options displayed as checkboxes:
[X] Wood
[ ] Metal
[ ] Glass
...
I can handle it on frontend once the buckets are created. Creation of different buckets for these options is What I am struggling with.
I have succesfully created and displayed and using aggregations for Category, Manufacturer and other basic ones. Thes product options are stored in has_many_through relationships in database. I am using Rails + searchkick gem, but those allow me to create raw queries to elastic search.
The prerequisite for such aggregation is to have options field as nested.
Sample index mapping:
PUT test
{
"mappings": {
"properties": {
"title": {
"type": "keyword"
},
"options": {
"type": "nested",
"properties": {
"title": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
}
}
}
Sample docs:
PUT test/_doc/1
{
"title": "Roncato Eglo",
"options": [
{
"title": "Mount type",
"value": "E27"
},
{
"title": "Material",
"value": "wood"
}
]
}
PUT test/_doc/2
{
"title": "Eglo",
"options": [
{
"title": "Mount type",
"value": "E27"
},
{
"title": "Material",
"value": "metal"
}
]
}
Assumption: For a given document a title under option appears only once. For e.g. there can exists only one nested document under option having title as Material.
Query for aggregation:
GET test/_search
{
"size": 0,
"aggs": {
"OPTION": {
"nested": {
"path": "options"
},
"aggs": {
"TITLE": {
"terms": {
"field": "options.title",
"size": 10
},
"aggs": {
"VALUES": {
"terms": {
"field": "options.value",
"size": 10
}
}
}
}
}
}
}
}
Response:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"OPTION" : {
"doc_count" : 4,
"TITLE" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Material",
"doc_count" : 2,
"VALUES" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "metal",
"doc_count" : 1
},
{
"key" : "wood",
"doc_count" : 1
}
]
}
},
{
"key" : "Mount type",
"doc_count" : 2,
"VALUES" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "E27",
"doc_count" : 2
}
]
}
}
]
}
}
}
}

Swagger 3.0 reDoc Discriminator JSON

I'm currently writing swagger 3.0 documentation and using reDoc to render as nice UI for it. I have a few scenarios in my documentation where based on a previous properties enum I would want to display different schema object properties. Sadly I cant seam to figure out how to wire this together properly in my documentation. So far I have the following test endpoint:
{
"post": {
"operationId" : "test",
"summary": "test",
"description": "test",
"tags": [ "test" ],
"consumes": "application/json",
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "./schemas/test1.json"
},
{
"$ref": "./schemas/test2.json"
}
],
"discriminator": {
"propertyName": "pet_type",
"mapping": {
"click": "./schemas/test1.json",
"open": "./schemas/test2.json"
}
}
}
}
}
},
"responses": {
"200": {
"description": "Success"
}
}
}
}
The test1.json looks like this:
{
"Cat": {
"type": "object",
"properties": {
"pet_type": {
"type": "string"
},
"hunts": {
"type": "boolean"
},
"age": {
"type": "integer"
}
},
"discriminator": {
"propertyName": "pet_type"
}
}
}
And the test2.json like this:
{
"Dog": {
"type": "object",
"properties": {
"pet_type": {
"type": "string"
},
"bark": {
"type": "boolean"
},
"breed": {
"type": "string",
"enum": [
"Dingo",
"Husky",
"Retriever",
"Shepherd"
]
}
},
"discriminator": {
"propertyName": "pet_type"
}
}
}
The desired out come would be to toggle between the two "test" jsons based on an enum (the drop down seen in the reDoc sample). What am I missing to get this result?
You can see an example of the discriminator result here under the feature section (the first gif)
After more digging I was able to figure out the issue... my structure for the most part.
On my index.json file I updated my components section to point at my components folder containing the schema as such:
"components": {
"$ref": "./components/test.json"
},
The test.json looks like the following:
{
"schemas": {
"Refinance": {
"description": "A representation of a cat",
"allOf": [
{
"$ref": "#/schemas/Pet"
},
{
"type": "object",
"properties": {
"huntingSkill": {
"type": "string",
"description": "The measured skill for hunting",
"default": "lazy",
"enum": [
"clueless",
"lazy",
"adventurous",
"aggressive"
]
}
},
"required": [
"huntingSkill"
]
}
]
},
"Purchase": {
"description": "A representation of a dog",
"allOf": [
{
"$ref": "#/schemas/Pet"
},
{
"type": "object",
"properties": {
"packSize": {
"type": "integer",
"format": "int32",
"description": "The size of the pack the dog is from",
"default": 1,
"minimum": 1
},
"foobar": {
"type": "string",
"description": "some ol bullshit"
}
},
"required": [
"packSize"
]
}
]
},
"Pet": {
"type": "object",
"discriminator": {
"propertyName": "petType"
},
"properties": {
"petType": {
"description": "Type of a pet",
"type": "string"
}
},
"xml": {
"name": "Pet"
}
}
}
}
And finally the schema for the endpoint gets referenced as follows:
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"$ref": "../../index.json#/components/schemas/Pet"
}
}
}
},

How to make Elasticsearch sort/prefer hits with exactly matching strings first

I'm using default analyzers and indexing. So let's say I have this simple mapping:
"question": {
"properties": {
"title": {
"type": "string"
},
"answer": {
"properties": {
"text": {
"type": "string"
}
}
}
}
}
(that was an example. sorry if it has typos)
Now, I perform the following search.
GET _search
{
"query": {
"query_string": {
"query": "yes correct",
"fields": ["answer.text"]
}
}
}
The results will score a text value like "yes correct." (doc id value 1) higher than simply "yes correct" (without a period, doc id value 181). Both hits have the same score value, but the hits array lists the one with the smaller doc id first. I understand that the default index option includes sorting by doc id, so how do I exclude that one attribute and still use the rest of the default options?
I'm not setting any custom analyzers, so everything is using default values for Elasticsearch 2.0.
This is probably a use case for Dis Max Query
A query that generates the union of documents produced by its
subqueries, and that scores each document with the maximum score for
that document as produced by any subquery, plus a tie breaking
increment for any additional matching subqueries.
So following that, you need to make your answer score as an exact match and give it highest boost. You'll have to use a custom analyzer for that. That'd be your mappings:
PUT /test
{
"settings": {
"analysis": {
"analyzer": {
"my_keyword": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"asciifolding",
"lowercase"
]
}
}
}
},
"mappings": {
"question": {
"properties": {
"title": {
"type": "string"
},
"answer": {
"type": "object",
"properties": {
"text": {
"type": "string",
"analyzer": "my_keyword",
"fields": {
"stemmed": {
"type": "string",
"analyzer": "standard"
}
}
}
}
}
}
}
}
}
Your test data:
PUT /test/question/1
{
"title": "title nr1",
"answer": [
{
"text": "yes correct."
}
]
}
PUT /test/question/2
{
"title": "title nr2",
"answer": [
{
"text": "yes correct"
}
]
}
Now when you're querying for "yes correct." using such query:
POST /test/_search
{
"query": {
"dis_max": {
"tie_breaker": 0.7,
"boost": 1.2,
"queries": [
{
"match": {
"answer.text": {
"query": "yes correct.",
"type": "phrase"
}
}
},
{
"match": {
"answer.text.stemmed": {
"query": "yes correct.",
"operator": "and"
}
}
}
]
}
}
}
You get this output:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.37919715,
"hits": [
{
"_index": "test",
"_type": "question",
"_id": "1",
"_score": 0.37919715,
"_source": {
"title": "title nr1",
"answer": [
{
"text": "yes correct."
}
]
}
},
{
"_index": "test",
"_type": "question",
"_id": "2",
"_score": 0.11261705,
"_source": {
"title": "title nr2",
"answer": [
{
"text": "yes correct"
}
]
}
}
]
}
}
If you run very same query without trailing dot, which then becomes "yes correct", you're getting this result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.37919715,
"hits": [
{
"_index": "test",
"_type": "question",
"_id": "2",
"_score": 0.37919715,
"_source": {
"title": "title nr2",
"answer": [
{
"text": "yes correct"
}
]
}
},
{
"_index": "test",
"_type": "question",
"_id": "1",
"_score": 0.11261705,
"_source": {
"title": "title nr1",
"answer": [
{
"text": "yes correct."
}
]
}
}
]
}
}
Hopefully this is what you're looking for.
By the way, I'd recommend to always use Match query when performing text search. Taken from documentation:
Comparison to query_string / field The match family of queries
does not go through a "query parsing" process. It does not support
field name prefixes, wildcard characters, or other "advanced"
features. For this reason, chances of it failing are very small / non
existent, and it provides an excellent behavior when it comes to just
analyze and run that text as a query behavior (which is usually what a
text search box does). Also, the phrase_prefix type can provide a
great "as you type" behavior to automatically load search results.
Elasticsearch or rather Lucene scoring does not take into account the relative positioning of the tokens. It utlizes 3 different criterias to do the same
Term frequency - Frequency at which the search terms is present in
the document
Inverse document frequency - Number of occurrence of the search term
in the entire database. The more the occurance , the more the common
is the search term and less the importance it has in search
Field length normalization - Number of tokens present in the target
field.
You can learn more about it here.

Elasticsearch DSL with multiple multi_match query in ruby

I have this scenario wherein there are two multi_match searches within the same query. The trouble is, when I create the JSON for it in ruby, a json with non-unique keys doesn't seem possible so only one of them appear.
Here is my query:
{
"fields": ["id", "title",
"address.city", "address.state", "address.country", "address.state_code", "address.country_code", "proxy_titles", "location"],
"size":2,
"query":{
"filtered":{
"filter": {
"range": {
"custom_score": {
"gte": 100
}
}
},
"query":{
"bool": {
"must": {
"multi_match":{
"query": "term 1",
"type": "cross_fields",
"fields": ["title^2", "proxy_titles^2","description"]
}
},
"must": {
"multi_match": {
"query": "us",
"fields": ["address.city", "address.country", "address.state",
"address.zone", "address.country_code", "address.state_code", "address.zone_code"]
}
}
}
}
}
},
"sort": {
"_score": { "order": "desc" },
"variation": {"order": "asc"},
"updated_at": { "order": "desc" }
}
}
I have also only recently started using elasticsearch so it be very helpful if you could suggest me a better query to accomplish the same as well.
You have the syntax wrong. For multiple "must" values in a "bool", they need to be in an array. The documentation is not always terribly helpful, unfortunately (the bool query page shows this for "should" but not "must").
Try this:
{
"fields": ["id","title","address.city","address.state","address.country","address.state_code","address.country_code","proxy_titles","location"],
"size": 2,
"query": {
"filtered": {
"filter": {
"range": {
"custom_score": {
"gte": 100
}
}
},
"query": {
"bool": [
{
"must": {
"multi_match": {
"query": "term 1",
"type": "cross_fields",
"fields": ["title^2","proxy_titles^2","description"]
}
}
},
{
"must": {
"multi_match": {
"query": "us",
"fields": ["address.city","address.country","address.state","address.zone","address.country_code","address.state_code","address.zone_code"]
}
}
}
]
}
}
},
"sort": {
"_score": {
"order": "desc"
},
"variation": {
"order": "asc"
},
"updated_at": {
"order": "desc"
}
}
}

Resources