Keyword search
Keyword search, also called "BM25 (Best match 25)" or "sparse vector" search, returns objects that have the highest BM25F scores.
Basic BM25 search
To use BM25 keyword search, define a search string.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
limit=3
)
for o in response.objects:
print(o.properties)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="food"
)
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('food', {
limit: 3,
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'food',
})
.withLimit(3)
.withFields('question answer')
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("food")
limit := int(3)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{Name: "answer"},
).
WithBM25(query).
WithLimit(limit).
Do(ctx)
{
Get {
JeopardyQuestion(
limit: 3
bm25: {
query: "food"
}
) {
question
answer
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
},
{
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"answer": "a closer grocer",
"question": "A nearer food merchant"
}
]
}
}
}
Retrieve BM25F scores
You can retrieve the BM25F score values for each returned object.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
from weaviate.classes.query import MetadataQuery
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
return_metadata=MetadataQuery(score=True),
limit=3
)
for o in response.objects:
print(o.properties)
print(o.metadata.score)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="food"
)
.with_additional("score")
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('food', {
returnMetadata: ['score'],
limit: 3
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
console.log(object.metadata?.score);
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'food',
})
.withFields('question answer _additional { score }')
.withLimit(3)
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("food")
limit := int(3)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{Name: "answer"},
graphql.Field{
Name: "_additional",
Fields: []graphql.Field{
{Name: "score"},
},
},
).
WithBM25(query).
WithLimit(limit).
Do(ctx)
{
Get {
JeopardyQuestion(
limit: 3
bm25: {
query: "food"
}
) {
question
answer
_additional {
score
}
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.0140665"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
},
{
"_additional": {
"score": "2.8725255"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "2.7672548"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
}
]
}
}
}
Search on selected properties only
v1.19.0
A keyword search can be directed to only search a subset of object properties. In this example, the BM25 search only uses the `question` property to produce the BM25F score.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
from weaviate.classes.query import MetadataQuery
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
query_properties=["question"],
return_metadata=MetadataQuery(score=True),
limit=3
)
for o in response.objects:
print(o.properties)
print(o.metadata.score)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="food",
properties=["question"]
)
.with_additional("score")
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('safety', {
queryProperties: ['question'],
returnMetadata: ['score'],
limit: 3
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
console.log(object.metadata?.score);
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'food',
properties: ['question'],
})
.withLimit(3)
.withFields('question answer _additional { score }')
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("safety").WithProperties("question")
limit := int(3)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{
Name: "_additional",
Fields: []graphql.Field{
{Name: "score"},
},
},
).
WithBM25(query).
WithLimit(limit).
Do(ctx)
{
Get {
JeopardyQuestion(
limit: 3
bm25: {
query: "food"
properties: ["question"]
}
) {
question
answer
_additional {
score
}
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.7079012"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "3.4311616"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
},
{
"_additional": {
"score": "2.8312314"
},
"answer": "honey",
"question": "The primary source of this food is the Apis mellifera"
}
]
}
}
}
Use weights to boost properties
v1.19.0
You can weight how much each property affects the overall BM25F score. This example boosts the `question` property by a factor of 2 while the `answer` property keeps its default weight.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
query_properties=["question^2", "answer"],
limit=3
)
for o in response.objects:
print(o.properties)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="food",
properties=["question^2", "answer"]
)
.with_additional("score")
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('food', {
queryProperties: ['question^2', 'answer'],
returnMetadata: ['score'],
limit: 3
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
console.log(object.metadata?.score);
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'food',
properties: ['question^2', 'answer'],
})
.withLimit(3)
.withFields('question answer _additional { score }')
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("food").WithProperties("question^2", "answer")
limit := int(3)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{Name: "answer"},
).
WithBM25(query).
WithLimit(limit).
Do(ctx)
{
Get {
JeopardyQuestion(
limit: 3
bm25: {
query: "food"
properties: ["question^2", "answer"]
}
) {
question
answer
_additional {
score
}
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "4.0038033"
},
"answer": "cake",
"question": "Devil's food & angel food are types of this dessert"
},
{
"_additional": {
"score": "3.8706005"
},
"answer": "a closer grocer",
"question": "A nearer food merchant"
},
{
"_additional": {
"score": "3.2457707"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other"
}
]
}
}
}
Set tokenization
The BM25 query string is tokenized before it is used to search for objects using the inverted index.
You must specify the tokenization method in the collection definition for each property.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
from weaviate.classes.config import Configure, Property, DataType, Tokenization
client.collections.create(
"Article",
vectorizer_config=Configure.Vectorizer.text2vec_huggingface(),
properties=[
Property(
name="title",
data_type=DataType.TEXT,
vectorize_property_name=True, # Use "title" as part of the value to vectorize
tokenization=Tokenization.LOWERCASE # Use "lowercase" tokenization
),
Property(
name="body",
data_type=DataType.TEXT,
skip_vectorization=True, # Don't vectorize this property
tokenization=Tokenization.WHITESPACE # Use "whitespace" tokenization
),
]
)
class_obj = {
"class": "Article",
"vectorizer": "text2vec-huggingface", # this could be any vectorizer
"properties": [
{
"name": "title",
"dataType": ["text"],
"moduleConfig": {
"text2vec-huggingface": { # this must match the vectorizer used
"vectorizePropertyName": True,
"tokenization": "lowercase"
}
}
},
{
"name": "body",
"dataType": ["text"],
"moduleConfig": {
"text2vec-huggingface": { # this must match the vectorizer used
"skip": True, # Don't vectorize body
"tokenization": "whitespace"
}
}
},
],
}
client.schema.create_class(class_obj)
import { vectorizer, dataType, tokenization } from 'weaviate-client';
const newCollection = await client.collections.create({
name: 'Article',
vectorizers: vectorizer.text2VecHuggingFace(),
properties: [
{
name: 'title',
dataType: dataType.TEXT,
vectorizePropertyName: true,
tokenization: tokenization.LOWERCASE // or 'lowercase'
},
{
name: 'body',
dataType: dataType.TEXT,
skipVectorization: true,
tokenization: tokenization.WHITESPACE // or 'whitespace'
},
],
})
const classWithPropModuleSettings = {
class: 'Article',
vectorizer: 'text2vec-huggingface', // this could be any vectorizer
properties: [
{
name: 'title',
dataType: ['text'],
moduleConfig: {
'text2vec-huggingface': {
// this must match the vectorizer used
vectorizePropertyName: true,
tokenization: 'lowercase', // Use "lowercase" tokenization
},
},
},
{
name: 'body',
dataType: ['text'],
moduleConfig: {
'text2vec-huggingface': {
// this must match the vectorizer used
skip: true, // Don't vectorize this property
tokenization: 'whitespace', // Use "whitespace" tokenization
},
},
},
],
};
// Add the class to the schema
result = await client.schema
.classCreator()
.withClass(classWithPropModuleSettings)
.do();
`limit` & `offset`
Use `limit` to set a fixed maximum number of objects to return.
Optionally, use `offset` to paginate the results.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
limit=3,
offset=1
)
for o in response.objects:
print(o.properties)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="safety"
)
.with_additional("score")
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('safety', {
limit: 3,
offset: 1
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'safety',
})
.withFields('question answer _additional { score }')
.withLimit(3)
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("safety")
limit := int(3)
offset := int(1)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{Name: "answer"},
).
WithBM25(query).
WithLimit(limit).
WithOffset(offset).
Do(ctx)
{
Get {
JeopardyQuestion(
bm25: {
query: "safety"
}
limit: 3
) {
question
answer
_additional {
score
}
}
}
}
Limit result groups
To limit results to groups of similar distances to the query (for keyword search, groups of similar BM25 scores), use the `autocut` filter to set the number of groups to return.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="safety",
auto_limit=1
)
for o in response.objects:
print(o.properties)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer"])
.with_bm25(
query="safety"
)
.with_additional("score")
.with_autocut(1)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('safety', {
autoLimit: 1,
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'safety',
})
.withFields('question answer _additional { score }')
.withAutocut(1)
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("safety")
autoLimit := int(1)
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "question"},
graphql.Field{Name: "answer"},
).
WithBM25(query).
WithAutocut(autoLimit).
Do(ctx)
{
Get {
JeopardyQuestion(
bm25: {
query: "safety"
}
autocut: 1
) {
question
answer
_additional {
score
}
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "2.6768136"
},
"answer": "OSHA (Occupational Safety and Health Administration)",
"question": "The government admin. was created in 1971 to ensure occupational health & safety standards"
}
]
}
}
}
Group results
v1.25
Define criteria to group search results.
- Python Client v4
from weaviate.classes.query import GroupBy
jeopardy = client.collections.get("JeopardyQuestion")
# Grouping parameters
group_by = GroupBy(
prop="round", # group by this property
objects_per_group=3, # maximum objects per group
number_of_groups=2, # maximum number of groups
)
# Query
response = jeopardy.query.bm25(
query="California",
group_by=group_by
)
for grp_name, grp_content in response.groups.items():
print(grp_name, grp_content.objects)
Example response
The response is like this:
'Jeopardy!'
'Double Jeopardy!'
Filter results
For more specific results, use a `filter` to narrow your search.
- Python Client v4
- Python Client v3
- JS/TS Client v3
- JS/TS Client v2
- Go
- GraphQL
from weaviate.classes.query import Filter
jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
query="food",
filters=Filter.by_property("round").equal("Double Jeopardy!"),
return_properties=["answer", "question", "round"], # return these properties
limit=3
)
for o in response.objects:
print(o.properties)
response = (
client.query
.get("JeopardyQuestion", ["question", "answer", "round"])
.with_bm25(
query="food"
)
.with_where({
"path": ["round"],
"operator": "Equal",
"valueText": "Double Jeopardy!"
})
.with_additional("score")
.with_limit(3)
.do()
)
print(json.dumps(response, indent=2))
const jeopardy = client.collections.get('JeopardyQuestion');
const result = await jeopardy.query.bm25('food', {
limit: 3,
returnMetadata: ['score'],
filters: jeopardy.filter.byProperty('round').equal('Double Jeopardy!'),
returnProperties: ['question', 'answer', 'round'],
})
for (let object of result.objects) {
console.log(JSON.stringify(object.properties, null, 2));
}
result = await client.graphql
.get()
.withClassName('JeopardyQuestion')
.withBm25({
query: 'food',
})
.withWhere({
path: ['round'],
operator: 'Equal',
valueText: 'Double Jeopardy!',
})
.withLimit(3)
.withFields('question answer round _additional { score }')
.do();
console.log(JSON.stringify(result, null, 2));
ctx := context.Background()
className := "JeopardyQuestion"
query := (&graphql.BM25ArgumentBuilder{}).WithQuery("food")
limit := int(3)
filter := filters.Where().
WithPath([]string{"round"}).
WithOperator(filters.Equal).
WithValueString("Double Jeopardy!")
result, err := client.GraphQL().Get().
WithClassName(className).
WithFields(
graphql.Field{Name: "answer"},
graphql.Field{Name: "question"},
graphql.Field{Name: "round"},
).
WithBM25(query).
WithWhere(filter).
WithLimit(limit).
Do(ctx)
{
Get {
JeopardyQuestion(
limit: 3
bm25: {
query: "food"
}
where: {
path: ["round"]
operator: Equal
valueText: "Double Jeopardy!"
}
) {
question
answer
_additional {
score
}
}
}
}
Example response
The response is like this:
{
"data": {
"Get": {
"JeopardyQuestion": [
{
"_additional": {
"score": "3.0140665"
},
"answer": "food stores (supermarkets)",
"question": "This type of retail store sells more shampoo & makeup than any other",
"round": "Double Jeopardy!"
},
{
"_additional": {
"score": "1.9633813"
},
"answer": "honey",
"question": "The primary source of this food is the Apis mellifera",
"round": "Double Jeopardy!"
},
{
"_additional": {
"score": "1.6719631"
},
"answer": "pseudopods",
"question": "Amoebas use temporary extensions called these to move or to surround & engulf food",
"round": "Double Jeopardy!"
}
]
}
}
}
Tokenization
Weaviate converts filter terms into tokens. The default tokenization is `word`. The `word` tokenizer keeps alphanumeric characters, lowercases them, and splits on whitespace. It converts a string like "Test_domain_weaviate" into "test", "domain", and "weaviate".
For details and additional tokenization methods, see Tokenization.
Related pages
Questions and feedback
If you have any questions or feedback, let us know in the user forum.