Keyword search

Keyword search, also called "BM25" (Best Matching 25) or "sparse vector" search, returns the objects that have the highest BM25F scores.

Basic BM25 search

To use BM25 keyword search, define a search string.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="food",
    limit=3
)

for o in response.objects:
    print(o.properties)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="food"
    )
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');
const result = await myCollection.query.bm25('food',{
 limit: 3,
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'food',
  })
  .withLimit(3)
  .withFields('question answer')
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      limit: 3
      bm25: {
        query: "food"
      }
    ) {
      question
      answer
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "answer": "food stores (supermarkets)",
          "question": "This type of retail store sells more shampoo & makeup than any other"
        },
        {
          "answer": "cake",
          "question": "Devil's food & angel food are types of this dessert"
        },
        {
          "answer": "a closer grocer",
          "question": "A nearer food merchant"
        }
      ]
    }
  }
}

Retrieve BM25F scores

You can retrieve the BM25F score values for each returned object.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

from weaviate.classes.query import MetadataQuery

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="food",
    return_metadata=MetadataQuery(score=True),
    limit=3
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.score)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="food"
    )
    .with_additional("score")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('food',{
 limit: 3,
 returnMetadata: ['score']
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'food',
  })
  .withFields('question answer _additional { score }')
  .withLimit(3)
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      limit: 3
      bm25: {
        query: "food"
      }
    ) {
      question
      answer
      _additional {
        score
      }
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "_additional": {
            "score": "3.0140665"
          },
          "answer": "food stores (supermarkets)",
          "question": "This type of retail store sells more shampoo & makeup than any other"
        },
        {
          "_additional": {
            "score": "2.8725255"
          },
          "answer": "cake",
          "question": "Devil's food & angel food are types of this dessert"
        },
        {
          "_additional": {
            "score": "2.7672548"
          },
          "answer": "a closer grocer",
          "question": "A nearer food merchant"
        }
      ]
    }
  }
}

Search on selected properties only

Added in v1.19.0

A keyword search can be restricted to a subset of object properties. In this example, the BM25 search uses only the `question` property to produce the BM25F score.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

from weaviate.classes.query import MetadataQuery

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="safety",
    query_properties=["question"],
    return_metadata=MetadataQuery(score=True),
    limit=3
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.score)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="food",
      properties=["question"]
    )
    .with_additional("score")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('safety',{
 limit: 3,
 queryProperties: ['question'],
 returnMetadata: ['score']
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'food',
    properties: ['question'],
  })
  .withLimit(3)
  .withFields('question answer _additional { score }')
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      limit: 3
      bm25: {
        query: "food"
        properties: ["question"]
      }
    ) {
      question
      answer
      _additional {
        score
      }
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "_additional": {
            "score": "3.7079012"
          },
          "answer": "cake",
          "question": "Devil's food & angel food are types of this dessert"
        },
        {
          "_additional": {
            "score": "3.4311616"
          },
          "answer": "a closer grocer",
          "question": "A nearer food merchant"
        },
        {
          "_additional": {
            "score": "2.8312314"
          },
          "answer": "honey",
          "question": "The primary source of this food is the Apis mellifera"
        }
      ]
    }
  }
}

Use weights to boost properties

Added in v1.19.0

You can weight how much each property affects the overall BM25F score. This example boosts the `question` property by a factor of 2, while the `answer` property keeps its default weight.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="food",
    query_properties=["question^2", "answer"],
    limit=3
)

for o in response.objects:
    print(o.properties)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="food",
      properties=["question^2", "answer"]
    )
    .with_additional("score")
    .with_limit(3)
    .do()
  )

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('food',{
 limit: 3,
 returnMetadata: ['score'],
 queryProperties: ['question^2', 'answer']
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'food',
    properties: ['question^2', 'answer'],
  })
  .withLimit(3)
  .withFields('question answer _additional { score }')
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      limit: 3
      bm25: {
        query: "food"
        properties: ["question^2", "answer"]
      }
    ) {
      question
      answer
      _additional {
        score
      }
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "_additional": {
            "score": "4.0038033"
          },
          "answer": "cake",
          "question": "Devil's food & angel food are types of this dessert"
        },
        {
          "_additional": {
            "score": "3.8706005"
          },
          "answer": "a closer grocer",
          "question": "A nearer food merchant"
        },
        {
          "_additional": {
            "score": "3.2457707"
          },
          "answer": "food stores (supermarkets)",
          "question": "This type of retail store sells more shampoo & makeup than any other"
        }
      ]
    }
  }
}

Set tokenization

The BM25 query string is tokenized before it is used to search for objects using the inverted index.

Set the tokenization method for each property in the collection definition. Properties without an explicit setting use the default `word` tokenization.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2

from weaviate.classes.config import Configure, Property, DataType, Tokenization

client.collections.create(
    "Article",
    vectorizer_config=Configure.Vectorizer.text2vec_huggingface(),

    properties=[
        Property(
            name="title",
            data_type=DataType.TEXT,
            vectorize_property_name=True,  # Use "title" as part of the value to vectorize
            tokenization=Tokenization.LOWERCASE  # Use "lowercase" tokenization
        ),
        Property(
            name="body",
            data_type=DataType.TEXT,
            skip_vectorization=True,  # Don't vectorize this property
            tokenization=Tokenization.WHITESPACE  # Use "whitespace" tokenization
        ),
    ]
)

class_obj = {
    "class": "Article",
    "vectorizer": "text2vec-huggingface",  # this could be any vectorizer
    "properties": [
        {
            "name": "title",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-huggingface": {  # this must match the vectorizer used
                    "vectorizePropertyName": True,
                    "tokenization": "lowercase"
                }
            }
        },
        {
            "name": "body",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-huggingface": {  # this must match the vectorizer used
                    "skip": True,  # Don't vectorize body
                    "tokenization": "whitespace"
                }
            }
        },
    ],
}

client.schema.create_class(class_obj)

const newCollection = await client.collections.create({
  name: 'Article',
  properties: [
    {
      name: 'title',
      dataType: weaviate.configure.dataType.TEXT,
      vectorizePropertyName: true,
      tokenization: 'lowercase'
    },
    {
      name: 'body',
      dataType: weaviate.configure.dataType.TEXT,
      skipVectorization: true,
      tokenization: 'whitespace'
    },],
  vectorizer: weaviate.configure.vectorizer.text2VecCohere(),
})
// The returned value is the full class definition, showing all defaults
console.log(JSON.stringify(newCollection, null, 2));

const classWithPropModuleSettings = {
  class: 'Article',
  vectorizer: 'text2vec-huggingface', // this could be any vectorizer
  properties: [
    {
      name: 'title',
      dataType: ['text'],
      moduleConfig: {
        'text2vec-huggingface': {
          // this must match the vectorizer used
          vectorizePropertyName: true,
          tokenization: 'lowercase', // Use "lowercase" tokenization
        },
      },
    },
    {
      name: 'body',
      dataType: ['text'],
      moduleConfig: {
        'text2vec-huggingface': {
          // this must match the vectorizer used
          skip: true, // Don't vectorize this property
          tokenization: 'whitespace', // Use "whitespace" tokenization
        },
      },
    },
  ],
};

// Add the class to the schema
result = await client.schema
  .classCreator()
  .withClass(classWithPropModuleSettings)
  .do();

// The returned value is the full class definition, showing all defaults
console.log(JSON.stringify(result, null, 2));

`limit` & `offset`

Use `limit` to set a fixed maximum number of objects to return.

Optionally, use `offset` to paginate the results.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="safety",
    limit=3
)

for o in response.objects:
    print(o.properties)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="safety"
    )
    .with_additional("score")
    .with_limit(3)
    .do()
)

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('safety',{
 limit: 3,
 returnMetadata: ['score']
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'safety',
  })
  .withFields('question answer _additional { score }')
  .withLimit(3)
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      bm25: {
        query: "safety"
      }
      limit: 3
    ) {
      question
      answer
      _additional {
        score
      }
    }
  }
}

Limit result groups

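To limit results to groups of objects with similar scores, use the `autocut` filter to set the number of groups to return. For keyword search, the groups are determined by jumps in the BM25F score rather than by vector distance.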

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="safety",
    auto_limit=1
)

for o in response.objects:
    print(o.properties)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(
      query="safety"
    )
    .with_additional("score")
    .with_autocut(1)
    .do()
)

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('safety',{
 autoLimit: 1,
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'safety',
  })
  .withFields('question answer _additional { score }')
  .withAutocut(1)
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      bm25: {
        query: "safety"
      }
      autocut: 1
    ) {
      question
      answer
      _additional {
        score
      }
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "_additional": {
            "score": "2.6768136"
          },
          "answer": "OSHA (Occupational Safety and Health Administration)",
          "question": "The government admin. was created in 1971 to ensure occupational health & safety standards"
        }
      ]
    }
  }
}

Group results

Added in v1.25

Define criteria to group search results.

Python (v4)

from weaviate.classes.query import GroupBy

jeopardy = client.collections.get("JeopardyQuestion")

# Grouping parameters
group_by = GroupBy(
    prop="round",  # group by this property
    objects_per_group=3,  # maximum objects per group
    number_of_groups=2,  # maximum number of groups
)

# Query
response = jeopardy.query.bm25(
    query="California",
    group_by=group_by
)

for grp_name, grp_content in response.groups.items():
    print(grp_name, grp_content.objects)

Example response

The response is like this:

'Jeopardy!'
'Double Jeopardy!'

Filter results

For more specific results, use a filter to narrow your search.

Python Client v4
Python Client v3
JS/TS Client v3
JS/TS Client v2
GraphQL

from weaviate.classes.query import Filter

jeopardy = client.collections.get("JeopardyQuestion")
response = jeopardy.query.bm25(
    query="food",
    filters=Filter.by_property("round").equal("Double Jeopardy!"),
    return_properties=["answer", "question", "round"], # return these properties
    limit=3
)

for o in response.objects:
    print(o.properties)

response = (
    client.query
    .get("JeopardyQuestion", ["question", "answer", "round"])
    .with_bm25(
      query="food"
    )
    .with_where({
        "path": ["round"],
        "operator": "Equal",
        "valueText": "Double Jeopardy!"
    })
    .with_additional("score")
    .with_limit(3)
    .do()
  )

print(json.dumps(response, indent=2))

const myCollection = client.collections.get('JeopardyQuestion');

const result = await myCollection.query.bm25('food',{
 limit: 3,
 returnMetadata: ['score'],
 filters: myCollection.filter.byProperty('round').equal('Double Jeopardy!'),
 returnProperties: ['question', 'answer', 'round'],
})

console.log(JSON.stringify(result.objects, null, 2));

result = await client.graphql
  .get()
  .withClassName('JeopardyQuestion')
  .withBm25({
    query: 'food',
  })
  .withWhere({
    path: ['round'],
    operator: 'Equal',
    valueText: 'Double Jeopardy!',
  })
  .withLimit(3)
  .withFields('question answer round _additional { score }')
  .do();

console.log(JSON.stringify(result, null, 2));

{
  Get {
    JeopardyQuestion(
      limit: 3
      bm25: {
        query: "food"
      }
      where: {
        path: ["round"]
        operator: Equal
        valueText: "Double Jeopardy!"
      }
    ) {
      question
      answer
      round
      _additional {
        score
      }
    }
  }
}

Example response

The response is like this:

{
  "data": {
    "Get": {
      "JeopardyQuestion": [
        {
          "_additional": {
            "score": "3.0140665"
          },
          "answer": "food stores (supermarkets)",
          "question": "This type of retail store sells more shampoo & makeup than any other",
          "round": "Double Jeopardy!"
        },
        {
          "_additional": {
            "score": "1.9633813"
          },
          "answer": "honey",
          "question": "The primary source of this food is the Apis mellifera",
          "round": "Double Jeopardy!"
        },
        {
          "_additional": {
            "score": "1.6719631"
          },
          "answer": "pseudopods",
          "question": "Amoebas use temporary extensions called these to move or to surround & engulf food",
          "round": "Double Jeopardy!"
        }
      ]
    }
  }
}

Tokenization

Weaviate converts filter terms into tokens. The default tokenization is `word`. The `word` tokenizer keeps only alphanumeric characters and splits the text on whitespace and on any other non-alphanumeric character, such as an underscore. For example, it converts the string "test_domain_weaviate" into the tokens "test", "domain", and "weaviate".

For details and additional tokenization methods, see Tokenization.

Questions and feedback

If you have any questions or feedback, let us know in the user forum.

Basic BM25 search

Retrieve BM25F scores

Search on selected properties only

Use weights to boost properties

Set tokenization

limit & offset

Limit result groups

Group results

Filter results

Tokenization

Related pages

Questions and feedback

Basic BM25 search

Retrieve BM25F scores

Search on selected properties only

Use weights to boost properties

Set tokenization

`limit` & `offset`

Limit result groups

Group results

Filter results

Tokenization

Related pages

Questions and feedback