Skip to main content

Keyword search

Keyword search, also called "BM25 (Best Matching 25)" or "sparse vector" search, returns objects that have the highest BM25F scores.

To use BM25 keyword search, define a search string.

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
limit=3
)

for o in response.objects:
print(o.properties)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
},
{
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"answer": "a closer grocer",
"question": "A nearer food merchant"
}
]
}
}
}

Retrieve BM25F scores

You can retrieve the BM25F score values for each returned object.

from weaviate.classes.query import MetadataQuery

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
return_metadata=MetadataQuery(score=True),
limit=3
)

for o in response.objects:
print(o.properties)
print(o.metadata.score)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.0140665"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
},
{
"_additional": {
"score": "2.8725255"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "2.7672548"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
}
]
}
}
}

Search on selected properties only

Added in v1.19.0

A keyword search can be directed to only search a subset of object properties. In this example, the BM25 search only uses the question property to produce the BM25F score.

from weaviate.classes.query import MetadataQuery

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
query_properties=["question"],
return_metadata=MetadataQuery(score=True),
limit=3
)

for o in response.objects:
print(o.properties)
print(o.metadata.score)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.7079012"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "3.4311616"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
},
{
"_additional": {
"score": "2.8312314"
},
"answer": "honey",
"question": "The primary source of this food is the Apis mellifera"
}
]
}
}
}

Use weights to boost properties

Added in v1.19.0

You can weight how much each property affects the overall BM25F score. This example boosts the question property by a factor of 2 while the answer property remains static.

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
query_properties=["question^2", "answer"],
limit=3
)

for o in response.objects:
print(o.properties)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "4.0038033"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "3.8706005"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
},
{
"_additional": {
"score": "3.2457707"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
}
]
}
}
}

Set tokenization

The BM25 query string is tokenized before it is used to search for objects using the inverted index.

You must specify the tokenization method in the collection definition for each property.

from weaviate.classes.config import Configure, Property, DataType, Tokenization

client.collections.create(
"Article",
vectorizer_config=Configure.Vectorizer.text2vec_huggingface(),

properties=[
Property(
name="title",
data_type=DataType.TEXT,
vectorize_property_name=True, # Use "title" as part of the value to vectorize
tokenization=Tokenization.LOWERCASE # Use "lowercase" tokenization
),
Property(
name="body",
data_type=DataType.TEXT,
skip_vectorization=True, # Don't vectorize this property
tokenization=Tokenization.WHITESPACE # Use "whitespace" tokenization
),
]
)

limit & offset

Use limit to set a fixed maximum number of objects to return.

Optionally, use offset to paginate the results.

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
limit=3,
offset=1
)

for o in response.objects:
print(o.properties)

Limit result groups

To limit results to groups of objects with similar scores, use the autocut filter (the auto_limit parameter) to set the number of groups to return.

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
auto_limit=1
)

for o in response.objects:
print(o.properties)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "2.6768136"
},
"answer": "OSHA (Occupational Safety and Health Administration)",
"question": "The government admin. was created in 1971 to ensure occupational health & safety standards"
}
]
}
}
}

Group results

Added in v1.25

Define criteria to group search results.

from weaviate.classes.query import GroupBy

jeopardy = client.collections.get("JeopardyQuestion")

# Grouping parameters
group_by = GroupBy(
prop="round", # group by this property
objects_per_group=3, # maximum objects per group
number_of_groups=2, # maximum number of groups
)

# Query
response = jeopardy.query.bm25(
query="California",
group_by=group_by
)

for grp_name, grp_content in response.groups.items():
print(grp_name, grp_content.objects)
Example response

The response is like this:

'Jeopardy!'
'Double Jeopardy!'

Filter results

For more specific results, use a filter to narrow your search.

from weaviate.classes.query import Filter

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
filters=Filter.by_property("round").equal("Double Jeopardy!"),
return_properties=["answer", "question", "round"], # return these properties
limit=3
)

for o in response.objects:
print(o.properties)
Example response

The response is like this:

{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.0140665"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other",
"round": "Double Jeopardy!"
},
{
"_additional": {
"score": "1.9633813"
},
"answer": "honey",
"question": "The primary source of this food is the Apis mellifera",
"round": "Double Jeopardy!"
},
{
"_additional": {
"score": "1.6719631"
},
"answer": "pseudopods",
"question": "Amoebas use temporary extensions called these to move or to surround & engulf food",
"round": "Double Jeopardy!"
}
]
}
}
}

Tokenization

Weaviate converts filter terms into tokens. The default tokenization is word. The word tokenizer keeps only alphanumeric characters, lowercases them, and splits on whitespace. For example, it converts a string like "Test_domain_weaviate" into "test", "domain", and "weaviate".

For details and additional tokenization methods, see Tokenization.

Questions and feedback

If you have any questions or feedback, let us know in the user forum.