This demonstration uses a technique called semantic re-ranking to improve the relevance of search results, with the Wayfair WANDS dataset serving as the accuracy benchmark.

In this exercise, we will use various techniques to assess search relevance by measuring the Mean Average Precision at K (MAP@K) of our algorithms — their ability to retrieve relevant products for various search queries.
The techniques we will use include:
- TF-IDF cosine similarity
- BM25
- Bi-encoder semantic search (sentence embeddings)
- Cross-encoder semantic re-ranking
# Install the required packages into the environment
# (rank-bm25 and sentence-transformers are the only new installs here;
# torch and transformers are already present in the image, per the log below)
!pip install rank-bm25 sentence-transformers torch transformers
Collecting rank-bm25
Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting sentence-transformers
Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Requirement already satisfied: torch in /opt/conda/lib/python3.10/site-packages (2.1.2+cpu)
Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (4.39.3)
Requirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rank-bm25) (1.26.4)
Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (4.66.1)
Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.2.2)
Requirement already satisfied: scipy in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.11.4)
Requirement already satisfied: huggingface-hub>=0.15.1 in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (0.22.2)
Requirement already satisfied: Pillow in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (9.5.0)
Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch) (3.13.1)
Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch) (4.9.0)
Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch) (1.12)
Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch) (3.2.1)
Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch) (3.1.2)
Requirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch) (2024.2.0)
Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from transformers) (21.3)
Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from transformers) (6.0.1)
Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers) (2023.12.25)
Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from transformers) (2.31.0)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/lib/python3.10/site-packages (from transformers) (0.15.2)
Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers) (0.4.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->transformers) (3.1.1)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->transformers) (2024.2.2)
Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (1.4.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (3.2.0)
Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch) (1.3.0)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rank-bm25, sentence-transformers
Successfully installed rank-bm25-0.2.2 sentence-transformers-2.7.0
# Import libraries
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
# Check data files: walk the Kaggle input directory and print every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
/kaggle/input/wayfair-wands-dataset/query.csv
/kaggle/input/wayfair-wands-dataset/product.csv
/kaggle/input/wayfair-wands-dataset/label.csv
# Compile functions for later implementation
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings over the sequence dimension, ignoring padding.

    Positions where attention_mask is 0 are zeroed out before summing, and the
    sum is divided by the number of attended (mask == 1) positions per row.
    """
    pad_positions = ~attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(pad_positions, 0.0).sum(dim=1)
    counts = attention_mask.sum(dim=1)[..., None]
    return summed / counts
# Function to lookup query matches
def get_exact_matches_for_(query):
    """Return the product_ids labeled 'Exact' for the given query_id.

    Reads the module-level `label_df` (query_id / product_id / label table).
    """
    rows = label_df[label_df['query_id'] == query]
    is_exact = rows['label'] == 'Exact'
    return rows.loc[is_exact, 'product_id'].values
# Function to get tf-idf results
def get_tfidf_products(x):
    """Return the 10 corpus row indices most similar to query string x,
    best match first, using the module-level fitted `vec` and TF-IDF `matrix`."""
    sims = cosine_similarity(vec.transform([x]), matrix).flatten()
    top_ten = sims.argsort()[-10:]
    return top_ten[::-1]
#define functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k = 10):
    """
    Calculate the Mean Average Precision at K (MAP@K) for a single query.

    Each relevant, previously-unseen prediction at rank i contributes
    precision@i (hits-so-far / rank); the total is normalized by
    min(len(true_ids), k).

    Parameters:
    true_ids (list): List of relevant product IDs.
    predicted_ids (list): List of predicted product IDs, best match first.
    k (int): Number of top elements to consider.
    NOTE: IF you wish to change top k, please provide a justification for choosing the new value
    Returns:
    float: MAP@K score.
    """
    # If either list is empty there is nothing to score
    # (len() is used because numpy arrays reject bare truthiness)
    if not len(true_ids) or not len(predicted_ids):
        return 0.0
    relevant = set(true_ids)  # O(1) membership instead of scanning a list per rank
    seen = set()              # predictions already processed (duplicate guard)
    score = 0.0
    num_hits = 0.0
    # Calculate score over the top-k predictions
    for i, p_id in enumerate(predicted_ids[:k]):
        # Count a hit only the first time a relevant id appears
        if p_id in relevant and p_id not in seen:
            num_hits += 1.0
            score += num_hits / (i + 1.0)  # precision at this rank
        seen.add(p_id)
    # Return MAP@K (this metric is mislabeled "MAPE" elsewhere in the notebook;
    # it is Mean Average Precision, not a percent error)
    return score / min(len(true_ids), k)
# Function to execute bm search
def execute_bm_search(q):
    """Return the 10 corpus indices with the highest BM25 score for query q,
    best match first (uses the module-level `bm25` index).

    Fix: the original returned argsort(...)[-10:], which orders the top-10 in
    ASCENDING score order (best match last). MAP@K is rank-sensitive, so that
    systematically understated BM25's score; reversing makes the ordering
    consistent with get_tfidf_products.
    """
    return np.argsort(bm25.get_scores(q.split(' ')))[-10:][::-1]
# Function to perform reranking
def execute_reranking(data, query):
    """Retrieve `topk` candidates with the bi-encoder, re-score them with the
    cross-encoder, and return the corpus_ids of the 10 best, best first.

    Uses module-level `biencoder`, `corpus_embeddings`, `topk`, and
    `crossencoderembeddingmodel`.
    """
    # Embed the query with the bi-encoder
    q_emb = biencoder.encode(query, convert_to_tensor = True) #.cuda()
    # Candidate retrieval by cosine similarity against the corpus
    candidates = util.semantic_search(q_emb, corpus_embeddings, top_k = topk)[0]
    # Score each (query, passage) pair with the cross-encoder
    pairs = [[query, data[c['corpus_id']]] for c in candidates]
    scores = crossencoderembeddingmodel.predict(pairs)
    # Attach cross-encoder scores, then keep the 10 highest-scoring hits
    for candidate, s in zip(candidates, scores):
        candidate['score'] = s
    best = sorted(candidates, key = lambda c: c['score'], reverse = True)[:10]
    return [c['corpus_id'] for c in best]
# Load the components: the three tab-separated WANDS tables
_wands_dir = "/kaggle/input/wayfair-wands-dataset"
query_df = pd.read_csv(f"{_wands_dir}/query.csv", sep = '\t')
product_df = pd.read_csv(f"{_wands_dir}/product.csv", sep = '\t')
label_df = pd.read_csv(f"{_wands_dir}/label.csv", sep = '\t')
# Inspect initial dataframes
# Preview the product catalog: one row per product with name, class,
# category hierarchy, description, features, and rating statistics
product_df.head()
| product_id | product_name | product_class | category hierarchy | product_description | product_features | rating_count | average_rating | review_count | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | solid wood platform bed | Beds | Furniture / Bedroom Furniture / Beds & Headboa... | good , deep sleep can be quite difficult to ha... | overallwidth-sidetoside:64.7|dsprimaryproducts... | 15.0 | 4.5 | 15.0 |
| 1 | 1 | all-clad 7 qt . slow cooker | Slow Cookers | Kitchen & Tabletop / Small Kitchen Appliances ... | create delicious slow-cooked meals , from tend... | capacityquarts:7|producttype : slow cooker|pro... | 100.0 | 2.0 | 98.0 |
| 2 | 2 | all-clad electrics 6.5 qt . slow cooker | Slow Cookers | Kitchen & Tabletop / Small Kitchen Appliances ... | prepare home-cooked meals on any schedule with... | features : keep warm setting|capacityquarts:6.... | 208.0 | 3.0 | 181.0 |
| 3 | 3 | all-clad all professional tools pizza cutter | Slicers, Peelers And Graters | Browse By Brand / All-Clad | this original stainless tool was designed to c... | overallwidth-sidetoside:3.5|warrantylength : l... | 69.0 | 4.5 | 42.0 |
| 4 | 4 | baldwin prestige alcott passage knob with roun... | Door Knobs | Home Improvement / Doors & Door Hardware / Doo... | the hardware has a rich heritage of delivering... | compatibledoorthickness:1.375 '' |countryofori... | 70.0 | 5.0 | 42.0 |
# (rows, columns) of the product catalog — 42,994 products, 9 columns
product_df.shape
(42994, 9)
# Preview the search queries (query_id, query text, query class)
query_df.head()
| query_id | query | query_class | |
|---|---|---|---|
| 0 | 0 | salon chair | Massage Chairs |
| 1 | 1 | smart coffee table | Coffee & Cocktail Tables |
| 2 | 2 | dinosaur | Kids Wall Décor |
| 3 | 3 | turquoise pillows | Accent Pillows |
| 4 | 4 | chair and a half recliner | Recliners |
# Number of search queries in the benchmark — 480
query_df.shape[0]
480
# Preview the relevance labels linking query_id -> product_id
# (label values seen below include 'Exact' and 'Irrelevant')
label_df.head()
| id | query_id | product_id | label | |
|---|---|---|---|---|
| 0 | 0 | 0 | 25434 | Exact |
| 1 | 1 | 0 | 12088 | Irrelevant |
| 2 | 2 | 0 | 42931 | Exact |
| 3 | 3 | 0 | 2636 | Exact |
| 4 | 4 | 0 | 42923 | Exact |
# Count missing values per product column (class, hierarchy, description,
# and the rating columns all have gaps — handled by fillna below)
product_df.isnull().sum()
product_id 0
product_name 0
product_class 2852
category hierarchy 1556
product_description 6008
product_features 0
rating_count 9452
average_rating 9452
review_count 9452
dtype: int64
# Combine fields and process text
# Build a searchable text field: product name + description (missing descriptions -> '')
product_df['text'] = product_df['product_name'] + ' ' + product_df['product_description'].fillna('')
# Strip all punctuation in a single C-level translate pass
product_df['text'] = product_df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
# Remove digits, lowercase, and trim. Raw string r'\d+' fixes the invalid-escape
# DeprecationWarning of the original '\d+' (identical regex behavior).
# NOTE(review): the trailing .replace(' ', ' ') is a no-op as written — it was
# presumably meant to collapse double spaces ('  ' -> ' '); confirm intent
# before changing it, as that would alter the tokenization downstream.
product_df['text'] = product_df['text'].apply(lambda x: re.sub(r'\d+', '', x).lower().strip().replace(' ', ' '))
# Calculate TF-IDF
# Fit the vocabulary and IDF weights on all product texts, then vectorize the
# corpus; `vec` and `matrix` are consumed later by get_tfidf_products
vec = TfidfVectorizer()
tfidf = vec.fit(product_df['text'])
matrix = tfidf.transform(product_df['text'])
# Use the product ledger to calculate exact matches for each suggestion
query_df['matches'] = query_df['query_id'].apply(get_exact_matches_for_)
# Use TF-IDF to calculate cosine similarity and return similar entries
query_df['suggestions'] = query_df['query'].apply(get_tfidf_products)
# Calculate Mean Average Precision (MAP@10) per query
# (the metric the notebook elsewhere calls "MAPE" — it is MAP@K)
query_df['score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['suggestions'], k = 10), axis = 1)
# Init model: build the BM25 index over whitespace-tokenized product texts
bm25 = BM25Okapi(product_df['text'].apply(lambda x: x.split(' ')))
# Add BM result to data (top-10 corpus indices per query)
query_df['bm_suggestions'] = query_df['query'].apply(execute_bm_search)
# Score BM25 with MAP@10
query_df['bm_score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['bm_suggestions'], k = 10), axis = 1)
# Prepare an embedding model
# Prefix each string following the "query: ..." / "passage: ..." convention
# used by some embedding models (e.g. E5).
# NOTE(review): all-MiniLM-L6-v2 (loaded below) was not trained with these
# prefixes — confirm they are wanted; they are kept here to preserve behavior.
# List comprehensions over the Series replace the original slow iterrows()
# loops, and the builtin name `input` is no longer shadowed.
queries = [f"query: {q}" for q in query_df['query']]
passages = [f"passage: {t}" for t in product_df['text']]
input_texts = queries + passages
# Init model: load the pretrained sentence-embedding bi-encoder
embedder = SentenceTransformer("all-MiniLM-L6-v2")
modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]
sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
# Apply embeddings to data
# Encode every product passage into a tensor of sentence embeddings
# (CPU-only here; the recorded run took ~17 min wall time)
%time corpus_embeddings = embedder.encode(passages, convert_to_tensor = True)
Batches: 0%| | 0/1344 [00:00<?, ?it/s]
CPU times: user 31min 22s, sys: 3min 1s, total: 34min 24s
Wall time: 17min 8s
# Encode the 480 query strings (fast relative to the 42,994-passage corpus)
%time query_embeddings = embedder.encode(queries, convert_to_tensor = True)
Batches: 0%| | 0/15 [00:00<?, ?it/s]
CPU times: user 3.81 s, sys: 21.2 ms, total: 3.83 s
Wall time: 1.92 s
# Normalize the corpus embeddings to unit length so that dot product
# equals cosine similarity
#corpus_embeddings = corpus_embeddings.to("cuda")
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
# Normalize the query embeddings the same way
#query_embeddings = query_embeddings.to("cuda")
query_embeddings = util.normalize_embeddings(query_embeddings)
# Compute cosine similarities (dot product on normalized vectors), top-10 per query
hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function = util.dot_score, top_k = 10)
# Compile the semantic-similarity results back into the dataset
sims = [[y['corpus_id'] for y in x] for x in hits]
# Add the suggestions to the original data and score them with MAP@10
query_df['semantic_suggestions'] = sims
query_df['semantic_score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['semantic_suggestions'], k = 10), axis = 1)
# Init models for reranking
# Bi-encoder: fast candidate retrieval
biencoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross-encoder: slower but more precise (query, passage) pair scoring
crossencoderembeddingmodel = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# Set sequence length (max tokens per passage for the bi-encoder)
biencoder.max_seq_length = 512
# Num docs: candidates retrieved per query before re-ranking
topk = 100
modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
README.md: 0%| | 0.00/11.6k [00:00<?, ?B/s]
sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/383 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
config.json: 0%| | 0.00/794 [00:00<?, ?B/s]
pytorch_model.bin: 0%| | 0.00/90.9M [00:00<?, ?B/s]
/opt/conda/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
return self.fget.__get__(instance, owner)()
tokenizer_config.json: 0%| | 0.00/316 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
# Perform initial encoding
# Re-encode the corpus with the reranking bi-encoder (overwrites the
# MiniLM corpus_embeddings computed earlier)
corpus_embeddings = biencoder.encode(product_df['text'], convert_to_tensor = True, show_progress_bar = True)
# Encode only the first query ('salon chair') for the inspection demo below
query_embeddings = biencoder.encode(query_df['query'].iloc[0], convert_to_tensor = True, show_progress_bar = True) #.cuda()
Batches: 0%| | 0/1344 [00:00<?, ?it/s]
Batches: 0%| | 0/1 [00:00<?, ?it/s]
# Extract the top-100 (topk) candidate passages for the example query
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k = topk)
hits = hits[0]
# Inspection: re-score the candidates with the cross-encoder
cross_inp = [[query_df['query'].iloc[0], product_df['text'][hit['corpus_id']]] for hit in hits]
cross_scores = crossencoderembeddingmodel.predict(cross_inp)
# Attach the cross-encoder score to each hit (sorting happens below)
for idx in range(len(cross_scores)):
    hits[idx]['score'] = cross_scores[idx]
# Examine outputs: the three best re-ranked products for the query
print(f"Query Selection- {query_df['query'].iloc[0]}")
print("Top-3 Re-Ranked Hits")
hits = sorted(hits, key = lambda x: x['score'], reverse = True)
for hit in hits[0:3]:
    print("\t{:.3f}\t{}".format(hit['score'], product_df['product_name'][hit['corpus_id']].replace("\n", " ")))
Batches: 0%| | 0/4 [00:00<?, ?it/s]
Query Selection- salon chair
Top-3 Re-Ranked Hits
8.191 barberpub salon massage chair
8.156 hair salon chair
7.957 reclining faux leather massage chair
%%capture
# Perform semantic reranking on the data for every query
# (%%capture suppresses the per-query cross-encoder progress bars)
query_df['reranked'] = query_df['query'].apply(lambda x: execute_reranking(product_df['text'], x))
query_df['reranking_score'] = query_df.apply(lambda x: map_at_k(x['matches'], x['reranked'], k = 10), axis = 1)
# Final comparison: mean MAP@10 across all queries for each retrieval method.
# The header previously said "MAPE scores", but map_at_k computes Mean Average
# Precision at K, not Mean Absolute Percent Error — corrected here.
print('Semantic Scoring Results- MAP@10 scores')
print(f"TF-IDF on the Queries was {round(query_df.loc[:, 'score'].mean(), 3)}")
print(f"BM25 on the Queries was {round(np.mean(query_df['bm_score']), 3)}")
print(f"Cosine similarity of the embeddings on the Queries was {round(query_df.loc[:, 'semantic_score'].mean(), 3)}")
print(f"Semantic ReRanking on the Queries was {round(query_df.loc[:, 'reranking_score'].mean(), 3)}")
Semantic Scoring Results- MAPE scores
TF-IDF on the Queries was 0.272
BM25 on the Queries was 0.261
Cosine similarity of the embeddings on the Queries was 0.323
Semantic ReRanking on the Queries was 0.437