PRACTICAL 1
Aim: - Document Indexing and Retrieval
a) Implement an inverted index construction algorithm.
b) Build a simple document retrieval system using the
constructed index.
a) Implement an inverted index construction algorithm.
Code: from collections import defaultdict
# Function to build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    # Iterate over documents
    for doc_id, document in enumerate(documents):
        words = document.lower().split()  # Convert to lowercase and split into words
        for word in set(words):  # Use set to avoid duplicates in a document
            inverted_index[word].append(doc_id)
    return inverted_index
# Sample documents
documents = [
    "Information retrieval is important.",
    "Document indexing is part of retrieval.",
    "Retrieval systems need an inverted index."
]
# Build the inverted index
inverted_index = build_inverted_index(documents)
# Print the inverted index
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")
OUTPUT: information: [0]
retrieval: [0, 2]
is: [0, 1]
important.: [0]
part: [1]
of: [1]
document: [1]
retrieval.: [1]
indexing: [1]
index.: [2]
an: [2]
need: [2]
systems: [2]
inverted: [2]
b) Build a simple document retrieval system using the
constructed index.
Code: from collections import defaultdict
# Function to build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    # Iterate over documents
    for doc_id, document in enumerate(documents):
        words = document.lower().split()  # Convert to lowercase and split into words
        for word in set(words):  # Use set to avoid duplicates in a document
            inverted_index[word].append(doc_id)
    return inverted_index
# Function to retrieve documents for a query
def retrieve_documents(query, inverted_index):
    query = query.lower().split()  # Tokenize the query and convert to lowercase
    doc_ids = set()
    # Find documents for each word in the query
    for word in query:
        if word in inverted_index:
            doc_ids.update(inverted_index[word])
    return sorted(doc_ids)
# Sample documents
documents = [
    "Information retrieval is important.",
    "Document indexing is part of retrieval.",
    "Retrieval systems need an inverted index."
]
# Build the inverted index
inverted_index = build_inverted_index(documents)
# Display the inverted index
print("Inverted Index:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")
# Search query example
query = "retrieval index"
retrieved_docs = retrieve_documents(query, inverted_index)
# Display the retrieved documents (IDs)
print(f"\nDocuments matching the query '{query}': {retrieved_docs}")
OUTPUT:
Inverted Index:
important.: [0]
information: [0]
is: [0, 1]
retrieval: [0, 2]
retrieval.: [1]
document: [1]
indexing: [1]
part: [1]
of: [1]
need: [2]
index.: [2]
an: [2]
inverted: [2]
systems: [2]
Documents matching the query 'retrieval index': [0, 2]
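The retrieval above returns an unranked set of matching document IDs. As a small extension (a sketch only; rank_documents is a hypothetical helper that reuses the inverted_index built above), the results can be ordered by how many distinct query terms each document contains:
from collections import Counter

def rank_documents(query, inverted_index):
    match_counts = Counter()  # doc_id -> number of distinct query terms found in it
    for word in set(query.lower().split()):
        for doc_id in inverted_index.get(word, []):
            match_counts[doc_id] += 1
    # Highest term overlap first; ties broken by document ID
    return sorted(match_counts.items(), key=lambda item: (-item[1], item[0]))

print(rank_documents("retrieval index", inverted_index))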
PRACTICAL 2
Aim: - Retrieval Models
a) Implement the Boolean retrieval model and process queries.
b) Implement the vector space model with TF-IDF weighting and
cosine similarity.
a) Implement the Boolean retrieval model and process queries.
CODE: from collections import defaultdict
# Build the inverted index
def build_index(docs):
    index = defaultdict(set)
    for i, doc in enumerate(docs):
        for word in doc.lower().split():
            index[word].add(i)
    return index
# Process Boolean query (AND, OR, NOT)
def process_query(query, index):
    query = query.lower().split()
    if 'and' in query:
        terms = query[::2]
        result = index[terms[0]] & index[terms[1]]
    elif 'or' in query:
        terms = query[::2]
        result = index[terms[0]] | index[terms[1]]
    elif 'not' in query:
        terms = query[::2]
        result = index[terms[0]] - index[terms[1]]
    else:
        result = index[query[0]]
    return sorted(result)
# Sample documents
documents = [
    "Information retrieval is important",
    "Document indexing and retrieval help search engines",
    "Retrieval systems are useful for data analysis"
]
# Build inverted index
index = build_index(documents)
# Sample query
query = "information and retrieval"
print(f"Result for query '{query}': {process_query(query, index)}")
OUTPUT:
Result for query 'information and retrieval': [0]
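process_query above handles a single operator between two terms. A few more sample queries against the same index sketch how OR and NOT behave (the expected sets follow from the three sample documents):
for q in ["document or analysis", "retrieval not indexing", "systems"]:
    print(f"Result for query '{q}': {process_query(q, index)}")
# Expected: [1, 2], then [0, 2], then [2]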
b) Implement the vector space model with TF-IDF weighting and
cosine similarity.
CODE: import math
from collections import Counter
# Compute Term Frequency (TF)
def compute_tf(doc):
    tf = Counter(doc)  # Count occurrences of each word
    total_terms = len(doc)  # Total number of words in the document
    return {word: count / total_terms for word, count in tf.items()}
# Compute Inverse Document Frequency (IDF)
def compute_idf(docs):
    total_docs = len(docs)
    idf = {}
    all_words = set(word for doc in docs for word in doc)  # Unique words across all documents
    for word in all_words:
        doc_freq = sum(1 for doc in docs if word in doc)  # Count docs containing the word
        idf[word] = math.log(total_docs / (doc_freq + 1))  # Avoid division by zero
    return idf
# Compute TF-IDF for each document
def compute_tfidf(docs, idf):
    tfidf_docs = []
    for doc in docs:
        tf = compute_tf(doc)  # Get TF of the current document
        tfidf = {word: tf.get(word, 0) * idf.get(word, 0) for word in doc}  # TF-IDF calculation
        tfidf_docs.append(tfidf)
    return tfidf_docs
# Compute Cosine Similarity between two vectors
def cosine_similarity(vec1, vec2):
    words = set(vec1) | set(vec2)  # Combine words from both vectors
    dot_product = sum(vec1.get(word, 0) * vec2.get(word, 0) for word in words)  # Dot product
    magnitude1 = math.sqrt(sum(val**2 for val in vec1.values()))  # Magnitude of vec1
    magnitude2 = math.sqrt(sum(val**2 for val in vec2.values()))  # Magnitude of vec2
    if magnitude1 and magnitude2:
        return dot_product / (magnitude1 * magnitude2)  # Cosine similarity formula
    return 0
# Sample documents (list of words per document)
docs = [
    ["information", "retrieval", "is", "important"],
    ["document", "indexing", "and", "retrieval"],
    ["search", "engines", "use", "information"]
]
# Compute IDF and TF-IDF
idf = compute_idf(docs)
tfidf_docs = compute_tfidf(docs, idf)
# Debugging print: To check TF-IDF values
print("TF-IDF Vectors for Documents:")
for i, tfidf in enumerate(tfidf_docs):
    print(f"Document {i}: {tfidf}")
# Calculate cosine similarity between Document 0 and Document 1
similarity = cosine_similarity(tfidf_docs[0], tfidf_docs[1])
print(f"Cosine Similarity between Document 0 and Document 1: {similarity:.4f}")
OUTPUT:
TF-IDF Vectors for Documents:
Document 0: {'information': 0.0, 'retrieval': 0.0, 'is': 0.1013662770270411, 'important': 0.1013662770270411}
Document 1: {'document': 0.1013662770270411, 'indexing': 0.1013662770270411, 'and': 0.1013662770270411, 'retrieval': 0.0}
Document 2: {'search': 0.1013662770270411, 'engines': 0.1013662770270411, 'use': 0.1013662770270411, 'information': 0.0}
Cosine Similarity between Document 0 and Document 1: 0.0000
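As a cross-check, the same documents can be vectorised with scikit-learn (assuming it is installed); note that TfidfVectorizer uses a smoothed IDF and L2 normalisation, so the absolute weights will differ from the manual version above:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = [" ".join(doc) for doc in docs]  # join the token lists back into strings
tfidf_matrix = TfidfVectorizer().fit_transform(texts)
print(cosine_similarity(tfidf_matrix))   # pairwise similarity of all three documents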
PRACTICAL 3
Aim: - Spelling Correction in IR Systems
a) Develop a spelling correction module using edit distance
algorithms.
b) Integrate the spelling correction module into an
information retrieval system.
a) Develop a spelling correction module using edit distance
algorithms.
Code: import difflib
def correct_spelling(word, dictionary):
    suggestion = difflib.get_close_matches(word, dictionary, n=1, cutoff=0.7)
    return suggestion[0] if suggestion else word
# Example dictionary
dictionary = ["information", "retrieval", "search", "query", "system", "index"]
# Test the function
misspelled_word = "retrievel"
corrected_word = correct_spelling(misspelled_word, dictionary)
print(f"Did you mean: {corrected_word}?")
OUTPUT: Did you mean: retrieval?
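difflib ranks candidates by a similarity ratio rather than an explicit edit distance. A minimal Levenshtein (edit distance) sketch that could back the same module (edit_distance and correct_with_edit_distance are hypothetical helpers, reusing the dictionary above):
def edit_distance(a, b):
    # dp[i][j] = edits needed to turn a[:i] into b[:j]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[len(a)][len(b)]

def correct_with_edit_distance(word, dictionary):
    return min(dictionary, key=lambda w: edit_distance(word, w))

print(correct_with_edit_distance("retrievel", dictionary))  # expected to pick "retrieval"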
b) Integrate the spelling correction module into an
information retrieval system.
Code: import difflib
# Spelling correction function
def correct_spelling(word, dictionary):
    suggestion = difflib.get_close_matches(word, dictionary, n=1, cutoff=0.7)
    return suggestion[0] if suggestion else word
# Simple IR system with a small document collection
documents = {
    "doc1": "Information retrieval is important.",
    "doc2": "Spelling correction improves search results.",
    "doc3": "Query processing includes spell checking."
}
# Search function with spell correction
def search(query, dictionary, documents):
    words = query.split()
    corrected_query = " ".join(correct_spelling(word, dictionary) for word in words)
    print(f"Corrected Query: {corrected_query}")
    # Retrieve matching documents
    results = [doc for doc, content in documents.items() if corrected_query in content]
    return results if results else ["No relevant documents found."]
# Example dictionary
dictionary = ["information", "retrieval", "spelling", "correction", "query", "processing", "search", "results"]
# Test the system
query = "retrievel system"
results = search(query, dictionary, documents)
print("Search Results:", results)
OUTPUT: Corrected Query: retrieval system
Search Results: ['No relevant documents found.']
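The search above only matches documents that contain the whole corrected query as a substring, which is why nothing is returned. A looser, word-level variant (search_by_terms is a hypothetical helper reusing correct_spelling) keeps any document containing at least one corrected query term:
def search_by_terms(query, dictionary, documents):
    corrected = [correct_spelling(w, dictionary) for w in query.lower().split()]
    print("Corrected Query:", " ".join(corrected))
    results = []
    for doc, content in documents.items():
        words = content.lower().split()
        if any(term in words for term in corrected):  # any corrected term appears
            results.append(doc)
    return results or ["No relevant documents found."]

print("Search Results:", search_by_terms("retrievel system", dictionary, documents))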
PRACTICAL 4
Aim: - Evaluation Metrics for IR Systems
a) Calculate precision, recall, and F-measure for a given set
of retrieval results.
b) Use an evaluation toolkit to measure average precision
and other evaluation metrics.
a) Calculate precision, recall, and F-measure for a given set
of retrieval results.
Code: def evaluate_ir(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    true_positives = len(retrieved_set & relevant_set)  # Relevant and retrieved
    precision = true_positives / len(retrieved) if retrieved else 0
    recall = true_positives / len(relevant) if relevant else 0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0
    return precision, recall, f_measure
# Example data
retrieved_docs = ["doc1", "doc2", "doc3", "doc4"]
relevant_docs = ["doc2", "doc3", "doc5"]
# Calculate metrics
precision, recall, f_measure = evaluate_ir(retrieved_docs, relevant_docs)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F-Measure: {f_measure:.2f}")
OUTPUT: Precision: 0.50
Recall: 0.67
F-Measure: 0.57
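Precision is often also reported at a fixed cut-off k, since users mostly inspect the top of a ranked list. A minimal sketch (precision_at_k is a hypothetical helper) over the same example data:
def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]  # only the first k retrieved documents count
    return sum(1 for doc in top_k if doc in relevant) / k

for k in (1, 2, 4):
    print(f"P@{k}: {precision_at_k(retrieved_docs, relevant_docs, k):.2f}")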
b) Use an evaluation toolkit to measure average precision
and other evaluation metrics.
Code: from sklearn.metrics import average_precision_score
# Example relevance scores (1 = relevant, 0 = not relevant)
y_true = [1, 0, 1, 1, 0, 1] # Ground truth relevance
y_scores = [0.9, 0.2, 0.8, 0.7, 0.3, 0.6] # Retrieval system scores
# Calculate Average Precision
ap = average_precision_score(y_true, y_scores)
print(f"Average Precision (AP): {ap:.2f}")
OUTPUT: Average Precision (AP): 1.00
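Averaging AP over several queries gives Mean Average Precision (MAP). A short sketch with the same toolkit; the second query's relevance labels and scores are made up purely for illustration:
from sklearn.metrics import average_precision_score

queries = [
    ([1, 0, 1, 1, 0, 1], [0.9, 0.2, 0.8, 0.7, 0.3, 0.6]),
    ([0, 1, 1, 0, 0, 1], [0.4, 0.9, 0.5, 0.2, 0.1, 0.7]),
]
ap_values = [average_precision_score(y_true, y_scores) for y_true, y_scores in queries]
print(f"Mean Average Precision (MAP): {sum(ap_values) / len(ap_values):.2f}")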
PRACTICAL 5
Aim: - Text Categorization
a) Implement a text classification algorithm (e.g., Naive
Bayes or Support Vector Machines).
b) Train the classifier on a labelled dataset and evaluate its
performance
a) Implement a text classification algorithm (e.g., Naive
Bayes or Support Vector Machines).
Code: # Sample dataset: text and labels
data = [
    ("I love this movie", "positive"),
    ("This film is terrible", "negative"),
    ("Amazing story", "positive"),
    ("Worst movie ever", "negative")
]
# Training: Count word occurrences per class
word_counts = {"positive": {}, "negative": {}}
label_counts = {"positive": 0, "negative": 0}
for text, label in data:
    words = text.lower().split()
    label_counts[label] += 1
    for word in words:
        word_counts[label][word] = word_counts[label].get(word, 0) + 1
# Simple classification function
def classify(text):
    words = text.lower().split()
    scores = {
        "positive": label_counts["positive"],
        "negative": label_counts["negative"]
    }
    for label in scores:
        for word in words:
            scores[label] += word_counts[label].get(word, 0)
    return max(scores, key=scores.get)
# Test the classifier
test_text = "Amazing movie"
predicted_label = classify(test_text)
print(f"Predicted Label: {predicted_label}")
OUTPUT: Predicted Label: positive
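The scoring above simply adds raw counts. A closer match to multinomial Naive Bayes uses log class priors and Laplace-smoothed word likelihoods; a minimal sketch (classify_nb is a hypothetical helper reusing word_counts and label_counts from the code above):
import math

def classify_nb(text):
    vocab = set()
    for counts in word_counts.values():
        vocab.update(counts)
    total_docs = sum(label_counts.values())
    scores = {}
    for label in label_counts:
        score = math.log(label_counts[label] / total_docs)  # log prior P(label)
        total_words = sum(word_counts[label].values())
        for word in text.lower().split():
            count = word_counts[label].get(word, 0)
            # Laplace-smoothed likelihood P(word | label)
            score += math.log((count + 1) / (total_words + len(vocab)))
        scores[label] = score
    return max(scores, key=scores.get)

print(classify_nb("Amazing movie"))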
b) Train the classifier on a labelled dataset and evaluate its
performance.
Code: # Define positive and negative words manually
positive_words = {"good", "great", "excellent", "amazing", "fantastic", "delicious", "love", "best"}
negative_words = {"bad", "terrible", "worst", "awful", "hate", "horrible", "disgusting", "never"}
# Sample dataset: (text, label)
data = [
    ("The food was delicious", "positive"),
    ("I hated the taste", "negative"),
    ("Service was excellent", "positive"),
    ("It was a terrible experience", "negative"),
    ("The meal was fantastic", "positive"),
    ("I will never come back", "negative")
]
# Split into training and testing sets
train_data = data[:5] # First 5 for training
test_data = data[5:] # Last 1 for testing
# Simple keyword-based classification function
def classify(text):
    words = set(text.lower().split())  # Convert text into a set of words
    pos_count = len(words & positive_words)  # Count matching positive words
    neg_count = len(words & negative_words)  # Count matching negative words
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        return "neutral"  # If no match, classify as neutral
# Evaluate accuracy
correct = 0
for text, actual_label in test_data:
    predicted_label = classify(text)
    print(f"Text: '{text}' → Predicted: {predicted_label}, Actual: {actual_label}")
    if predicted_label == actual_label:
        correct += 1
accuracy = correct / len(test_data) * 100
print(f"Accuracy: {accuracy:.2f}%")
OUTPUT: Text: 'I will never come back' → Predicted: negative, Actual: negative
Accuracy: 100.00%
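The same train/test split can also be run through an off-the-shelf classifier. A sketch assuming scikit-learn is available (CountVectorizer builds bag-of-words features, MultinomialNB is the model), reusing the data list above:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = [text for text, _ in data]
labels = [label for _, label in data]

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts[:5])   # first five reviews for training
model = MultinomialNB().fit(X_train, labels[:5])

X_test = vectorizer.transform(texts[5:])        # held-out review for evaluation
print("Predicted:", list(model.predict(X_test)), "Actual:", labels[5:])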
PRACTICAL 6
Aim: - Clustering for Information Retrieval
a) Implement a clustering algorithm (e.g., K-means or
hierarchical clustering).
b) Apply the clustering algorithm to a set of documents
and evaluate the clustering results.
a) Implement a clustering algorithm (e.g., K-means or
hierarchical clustering).
Code: import random
# Sample data points (1D for simplicity)
data = [2, 4, 6, 8, 20, 22, 24, 26]
# Number of clusters
k = 2
# Initialize cluster centers randomly
centers = random.sample(data, k)
# Assign points to nearest cluster
def assign_clusters(data, centers):
    clusters = {c: [] for c in centers}
    for point in data:
        closest_center = min(centers, key=lambda c: abs(c - point))
        clusters[closest_center].append(point)
    return clusters
# Update cluster centers
def update_centers(clusters):
    return [sum(points) // len(points) for points in clusters.values()]
# Run K-means for a few iterations
for _ in range(5):
    clusters = assign_clusters(data, centers)
    centers = update_centers(clusters)
# Print final clusters
print("Final Clusters:", clusters)
OUTPUT: Final Clusters: {5: [2, 4, 6, 8], 23: [20, 22, 24, 26]}
b) Apply the clustering algorithm to a set of documents and
evaluate the clustering results.
Code: # Sample documents
documents = [
    "Apple and banana are fruits",
    "Mango and orange are sweet fruits",
    "Cars and bikes are vehicles",
    "Trucks and buses are heavy vehicles"
]
# Define keywords for clustering
cluster_keywords = {
    "fruits": ["apple", "banana", "mango", "orange", "fruits", "sweet"],
    "vehicles": ["cars", "bikes", "trucks", "buses", "vehicles", "heavy"]
}
# Assign each document to the closest cluster
clusters = {"fruits": [], "vehicles": []}
for doc in documents:
    words = doc.lower().split()
    fruit_score = sum(word in cluster_keywords["fruits"] for word in words)
    vehicle_score = sum(word in cluster_keywords["vehicles"] for word in words)
    if fruit_score > vehicle_score:
        clusters["fruits"].append(doc)
    else:
        clusters["vehicles"].append(doc)
# Print clustered documents
print("Clustered Documents:", clusters)
OUTPUT: Clustered Documents: {
'fruits': [
'Apple and banana are fruits',
'Mango and orange are sweet fruits'
],
'vehicles': [
'Cars and bikes are vehicles',
'Trucks and buses are heavy vehicles'
]
}
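The aim also asks for an evaluation of the clustering. A common sanity check is purity: within each cluster, count the documents carrying the dominant true label and divide by the total. A minimal sketch, assuming hand-assigned ground-truth labels for the four sample documents:
true_labels = {
    "Apple and banana are fruits": "fruits",
    "Mango and orange are sweet fruits": "fruits",
    "Cars and bikes are vehicles": "vehicles",
    "Trucks and buses are heavy vehicles": "vehicles"
}

def purity(clusters, true_labels):
    correct, total = 0, 0
    for cluster_docs in clusters.values():
        if not cluster_docs:
            continue
        labels = [true_labels[doc] for doc in cluster_docs]
        correct += max(labels.count(label) for label in set(labels))  # dominant label count
        total += len(cluster_docs)
    return correct / total if total else 0

print(f"Purity: {purity(clusters, true_labels):.2f}")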
PRACTICAL 7
Aim: - Web Crawling and Indexing
a) Develop a web crawler to fetch and index web
pages.
b) Handle challenges such as robots.txt, dynamic
content, and crawling delays.
a) Develop a web crawler to fetch and index web
pages.
Code: import requests
from bs4 import BeautifulSoup
# Function to crawl and index a webpage
def crawl(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract and print the page title and links
    title = soup.title.string if soup.title else "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True)]
    print(f"Title: {title}")
    print("Links Found:", links)
# Start crawling
crawl("https://example.com") # Replace with any website
OUTPUT: Title: Example Domain
Links Found: ['https://www.iana.org/domains/example']
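The function above fetches and indexes a single page. A small frontier-based sketch (breadth-first, capped at a few pages; the start URL and page limit are placeholders) shows how the same idea extends to an actual crawl:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_site(start_url, max_pages=5):
    visited = set()
    frontier = [start_url]
    page_index = {}  # url -> page title
    while frontier and len(visited) < max_pages:
        url = frontier.pop(0)
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip pages that fail to download
        soup = BeautifulSoup(response.text, "html.parser")
        page_index[url] = soup.title.string if soup.title else "No Title"
        for a in soup.find_all('a', href=True):
            frontier.append(urljoin(url, a['href']))  # resolve relative links
    return page_index

print(crawl_site("https://example.com"))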
b) Handle challenges such as robots.txt, dynamic
content, and crawling delays.
Code: import requests
import time
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
# Function to check robots.txt
def can_crawl(url):
    rp = RobotFileParser()
    rp.set_url(url + "/robots.txt")
    rp.read()
    return rp.can_fetch("*", url)
# Function to fetch and parse web page
def crawl(url):
    if not can_crawl(url):
        print("Crawling not allowed by robots.txt")
        return
    print(f"Crawling: {url}")
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code != 200:
        print("Failed to fetch page")
        return
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract and print page title and links
    title = soup.title.string if soup.title else "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True)]
    print(f"Title: {title}")
    print("Links Found:", links)
    time.sleep(2)  # Respect crawling delays
# Start crawling
crawl("https://example.com") # Replace with any real URL
OUTPUT: Crawling: https://example.com
Title: Example Domain
Links Found: ['https://www.iana.org/domains/example']
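robots.txt can also declare a Crawl-delay; RobotFileParser exposes it, so the fixed two-second sleep above could defer to the site's own value when one is present. A minimal sketch (polite_delay is a hypothetical helper):
from urllib.robotparser import RobotFileParser
import time

def polite_delay(base_url, default_delay=2):
    rp = RobotFileParser()
    rp.set_url(base_url + "/robots.txt")
    rp.read()
    delay = rp.crawl_delay("*")  # None when the site sets no Crawl-delay
    return delay if delay is not None else default_delay

time.sleep(polite_delay("https://example.com"))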
PRACTICAL 8
Aim: - Link Analysis and PageRank
a) Implement the PageRank algorithm to rank web pages
based on link analysis.
b) Apply the PageRank algorithm to a small web graph and
analyze the results.
a) Implement the PageRank algorithm to rank web pages
based on link analysis.
Code: # Simple PageRank Implementation
web_graph = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C']
}
# Initialize ranks
ranks = {page: 1.0 for page in web_graph}
damping_factor = 0.85
iterations = 5
# PageRank Calculation
for _ in range(iterations):
    new_ranks = {}
    for page in web_graph:
        new_rank = (1 - damping_factor) + damping_factor * sum(
            ranks[linking_page] / len(web_graph[linking_page])
            for linking_page in web_graph if page in web_graph[linking_page]
        )
        new_ranks[page] = new_rank
    ranks = new_ranks  # Update ranks
# Print final PageRank values
print("Final PageRanks:", ranks)
OUTPUT: Final PageRanks: {'A': 1.5540671874999998, 'B': 0.8104785546875, 'C': 1.4854542578125, 'D': 0.15000000000000002}
b) Apply the PageRank algorithm to a small web graph and
analyze the results.
Code: # Small web graph representation
web_graph = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C']
}
# Initialize PageRank values
ranks = {page: 1.0 for page in web_graph}
damping_factor = 0.85
iterations = 5
# Calculate PageRank
for _ in range(iterations):
    new_ranks = {}
    for page in web_graph:
        new_rank = (1 - damping_factor) + damping_factor * sum(
            ranks[linking_page] / len(web_graph[linking_page])
            for linking_page in web_graph if page in web_graph[linking_page]
        )
        new_ranks[page] = new_rank
    ranks = new_ranks  # Update ranks
# Analyze results by ranking pages
sorted_ranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
# Print final rankings
print("PageRank Scores:")
for page, score in sorted_ranks:
    print(f"Page {page}: {score:.4f}")
OUTPUT: PageRank Scores:
Page A: 1.5541
Page C: 1.4855
Page B: 0.8105
Page D: 0.1500
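A common variant spreads the teleport term (1 - d) over all pages and iterates until the ranks stop changing instead of for a fixed count, so the scores form a probability distribution over pages. A sketch over the same web_graph (pagerank is a hypothetical helper):
def pagerank(graph, damping=0.85, tol=1e-6, max_iter=100):
    n = len(graph)
    ranks = {page: 1 / n for page in graph}
    for _ in range(max_iter):
        new_ranks = {}
        for page in graph:
            incoming = sum(ranks[p] / len(graph[p]) for p in graph if page in graph[p])
            new_ranks[page] = (1 - damping) / n + damping * incoming
        if all(abs(new_ranks[p] - ranks[p]) < tol for p in graph):  # converged
            return new_ranks
        ranks = new_ranks
    return ranks

print(pagerank(web_graph))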
PRACTICAL 9
Aim: - Learning to Rank
a) Implement a learning to rank algorithm (e.g., RankSVM or
RankBoost).
b) Train the ranking model using labelled data and evaluate
its effectiveness.
a) Implement a learning to rank algorithm (e.g., RankSVM or
RankBoost).
Code: # Sample training data (features) and relevance scores
X = [[0.2, 0.8], [0.5, 0.4], [0.9, 0.6], [0.4, 0.7]]
y = [2, 1, 3, 2] # Higher values mean higher relevance
# Function to compute a simple ranking score
def rank_score(x):
    return sum(x)  # Simple sum of features as a ranking score
# Compute ranking scores for each sample
scores = [rank_score(x) for x in X]
# Sort by ranking score in descending order
ranked_results = sorted(zip(X, scores, y), key=lambda x: x[1], reverse=True)
# Print ranking results
print("Ranked Results (Feature, Score, Relevance):")
for item in ranked_results:
    print(item)
OUTPUT: Ranked Results (Feature, Score, Relevance):
([0.9, 0.6], 1.5, 3)
([0.4, 0.7], 1.1, 2)
([0.2, 0.8], 1.0, 2)
([0.5, 0.4], 0.9, 1)
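The score above is a fixed, hand-written function rather than a learned one. Pairwise methods such as RankSVM learn the weights from preference pairs instead; a minimal perceptron-style sketch on the same X and y (train_pairwise is a hypothetical helper; each update nudges the weights so that higher-relevance items outscore lower-relevance ones):
def train_pairwise(X, y, lr=0.1, epochs=20):
    w = [0.0] * len(X[0])  # one weight per feature
    for _ in range(epochs):
        for i in range(len(X)):
            for j in range(len(X)):
                if y[i] > y[j]:  # item i should be ranked above item j
                    score_i = sum(wk * xk for wk, xk in zip(w, X[i]))
                    score_j = sum(wk * xk for wk, xk in zip(w, X[j]))
                    if score_i <= score_j:  # misordered pair: adjust weights
                        w = [wk + lr * (xi - xj) for wk, xi, xj in zip(w, X[i], X[j])]
    return w

weights = train_pairwise(X, y)
learned = sorted(zip(X, (sum(wk * xk for wk, xk in zip(weights, x)) for x in X), y),
                 key=lambda t: t[1], reverse=True)
print("Learned weights:", weights)
print("Ranking by learned score:", learned)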
b) Train the ranking model using labelled data and evaluate
its effectiveness.
Code: # Sample training data (features) and relevance scores
documents = [
("Doc1", [3, 5]), # Features: word frequency, keyword match
("Doc2", [1, 8]),
("Doc3", [4, 2]),
("Doc4", [2, 6])
]
relevance = {"Doc1": 3, "Doc2": 2, "Doc3": 1, "Doc4": 2} # Higher value = more relevant
# Function to compute a ranking score (weighted sum of features)
def rank_score(features):
    return 0.7 * features[0] + 0.3 * features[1]  # Weighted formula
# Compute ranking scores for each document
scores = [(doc, rank_score(features), relevance[doc]) for doc, features in documents]
# Sort documents based on computed ranking score
ranked_results = sorted(scores, key=lambda x: x[1], reverse=True)
# Evaluate effectiveness (check if ranking order matches relevance scores)
correct = sum(1 for i in range(len(ranked_results) - 1) if ranked_results[i][2] >= ranked_results[i + 1][2])
accuracy = (correct / (len(ranked_results) - 1)) * 100
# Print ranked results and accuracy
print("Ranked Results (Document, Score, Relevance):")
for item in ranked_results:
    print(item)
print(f"Ranking Accuracy: {accuracy:.2f}%")
OUTPUT: Ranked Results (Document, Score, Relevance):
('Doc1', 3.5999999999999996, 3)
('Doc3', 3.4, 1)
('Doc4', 3.1999999999999997, 2)
('Doc2', 3.0999999999999996, 2)
Ranking Accuracy: 66.67%
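Ranking quality can also be summarised with NDCG, which rewards placing highly relevant documents near the top of the list. A short sketch over the ranked_results computed above (dcg is a hypothetical helper):
import math

def dcg(relevances):
    # Discounted cumulative gain: positions further down contribute less
    return sum(rel / math.log2(pos + 2) for pos, rel in enumerate(relevances))

predicted_order = [rel for _, _, rel in ranked_results]
ideal_order = sorted(predicted_order, reverse=True)
print(f"NDCG: {dcg(predicted_order) / dcg(ideal_order):.2f}")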
PRACTICAL 10
Aim: - Advanced Topics in Information Retrieval
a) Implement a text summarization algorithm (e.g.,
extractive or abstractive).
b) Build a question-answering system using techniques such
as information extraction.
a) Implement a text summarization algorithm (e.g.,
extractive or abstractive).
Code: # Sample text
text = """Text summarization is the process of condensing a document while preserving key
information.
Extractive summarization selects important sentences directly from the text, while abstractive
summarization generates new sentences.
Summarization helps improve readability and information retrieval efficiency."""
# Split text into sentences
sentences = text.replace("\n", " ").split(". ")  # Normalize newlines so the text splits into sentences
# Compute word frequency
word_counts = {}
for sentence in sentences:
    for word in sentence.split():
        word_counts[word] = word_counts.get(word, 0) + 1
# Score sentences based on word frequency
sentence_scores = {
    sentence: sum(word_counts.get(word, 0) for word in sentence.split())
    for sentence in sentences
}
# Select the highest-scoring sentence as summary
summary = max(sentence_scores, key=sentence_scores.get)
print("Summary:", summary)
OUTPUT: Summary: Extractive summarization selects important sentences directly from the text, while abstractive summarization generates new sentences
b) Build a question-answering system using techniques such
as information extraction.
Code: # Sample knowledge base (text and answers)
data = {
    "What is text summarization?": "Text summarization is the process of condensing a document while preserving key information.",
    "What is extractive summarization?": "Extractive summarization selects important sentences directly from the text.",
    "What is abstractive summarization?": "Abstractive summarization generates new sentences to convey the main idea."
}
# Function to find the best answer
def answer_question(question):
    return data.get(question, "Sorry, I don't know the answer.")
# User input
question = input("Ask a question: ")
print("Answer:", answer_question(question))
OUTPUT: Ask a question: What is extractive summarization?
Answer: Extractive summarization selects important sentences directly from the text.
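Exact-match lookup fails as soon as the wording changes. A small sketch (answer_question_fuzzy is a hypothetical helper over the same data dictionary) that instead returns the entry whose question shares the most words with the user's question, a very light form of information extraction:
def answer_question_fuzzy(question, data):
    question_words = set(question.lower().split())
    best_match, best_overlap = None, 0
    for known_question, answer in data.items():
        overlap = len(question_words & set(known_question.lower().split()))
        if overlap > best_overlap:
            best_match, best_overlap = known_question, overlap
    return data[best_match] if best_match else "Sorry, I don't know the answer."

print(answer_question_fuzzy("Tell me about extractive summarization", data))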
THE END