PRACTICAL 1
Aim: - Document Indexing and Retrieval
a) Implement an inverted index construction algorithm.
b) Build a simple document retrieval system using the
constructed index.
a) Implement an inverted index construction algorithm.
Code: from collections import defaultdict
# Function to build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    # Iterate over documents
    for doc_id, document in enumerate(documents):
        words = document.lower().split()  # Convert to lowercase and split into words
        for word in set(words):  # Use set to avoid duplicates in a document
            inverted_index[word].append(doc_id)
    return inverted_index
# Sample documents
documents = [
    "Information retrieval is important.",
    "Document indexing is part of retrieval.",
    "Retrieval systems need an inverted index."
]
# Build the inverted index
inverted_index = build_inverted_index(documents)
# Print the inverted index
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")
OUTPUT: information: [0]
retrieval: [0, 2]
is: [0, 1]
important.: [0]
part: [1]
of: [1]
document: [1]
retrieval.: [1]
indexing: [1]
index.: [2]
an: [2]
need: [2]
systems: [2]
inverted: [2]
b) Build a simple document retrieval system using the
constructed index.
Code: from collections import defaultdict
# Function to build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    # Iterate over documents
    for doc_id, document in enumerate(documents):
        words = document.lower().split()  # Convert to lowercase and split into words
        for word in set(words):  # Use set to avoid duplicates in a document
            inverted_index[word].append(doc_id)
    return inverted_index
# Function to retrieve documents for a query
def retrieve_documents(query, inverted_index):
    query = query.lower().split()  # Tokenize the query and convert to lowercase
    doc_ids = set()
    # Find documents for each word in the query
    for word in query:
        if word in inverted_index:
            doc_ids.update(inverted_index[word])
    return sorted(doc_ids)
# Sample documents
documents = [
    "Information retrieval is important.",
    "Document indexing is part of retrieval.",
    "Retrieval systems need an inverted index."
]
# Build the inverted index
inverted_index = build_inverted_index(documents)
# Display the inverted index
print("Inverted Index:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")
# Search query example
query = "retrieval index"
retrieved_docs = retrieve_documents(query, inverted_index)
# Display the retrieved documents (IDs)
print(f"\nDocuments matching the query '{query}': {retrieved_docs}")
OUTPUT:
Inverted Index:
important.: [0]
information: [0]
is: [0, 1]
retrieval: [0, 2]
retrieval.: [1]
document: [1]
indexing: [1]
part: [1]
of: [1]
need: [2]
index.: [2]
an: [2]
inverted: [2]
systems: [2]
Documents matching the query 'retrieval index': [0, 2]
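The retrieval above returns an unranked set of matching document IDs. As a small extension (a sketch only; rank_documents is a hypothetical helper that reuses the inverted_index built above), the results can be ordered by how many distinct query terms each document contains:
from collections import Counter

def rank_documents(query, inverted_index):
    match_counts = Counter()  # doc_id -> number of distinct query terms found in it
    for word in set(query.lower().split()):
        for doc_id in inverted_index.get(word, []):
            match_counts[doc_id] += 1
    # Highest term overlap first; ties broken by document ID
    return sorted(match_counts.items(), key=lambda item: (-item[1], item[0]))

print(rank_documents("retrieval index", inverted_index))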
PRACTICAL 2
Aim: - Retrieval Models
a) Implement the Boolean retrieval model and process queries.
b) Implement the vector space model with TF-IDF weighting and
cosine similarity.
a) Implement the Boolean retrieval model and process queries.
CODE: from collections import defaultdict
# Build the inverted index
def build_index(docs):
    index = defaultdict(set)
    for i, doc in enumerate(docs):
        for word in doc.lower().split():
            index[word].add(i)
    return index
# Process Boolean query (AND, OR, NOT)
def process_query(query, index):
    query = query.lower().split()
    if 'and' in query:
        terms = query[::2]
        result = index[terms[0]] & index[terms[1]]
    elif 'or' in query:
        terms = query[::2]
        result = index[terms[0]] | index[terms[1]]
    elif 'not' in query:
        terms = query[::2]
        result = index[terms[0]] - index[terms[1]]
    else:
        result = index[query[0]]
    return sorted(result)
# Sample documents
documents = [
    "Information retrieval is important",
    "Document indexing and retrieval help search engines",
    "Retrieval systems are useful for data analysis"
]
# Build inverted index
index = build_index(documents)
# Sample query
query = "information and retrieval"
print(f"Result for query '{query}': {process_query(query, index)}")
OUTPUT:
Result for query 'information and retrieval': [0]
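process_query above handles a single operator between two terms. A few more sample queries against the same index sketch how OR and NOT behave (the expected sets follow from the three sample documents):
for q in ["document or analysis", "retrieval not indexing", "systems"]:
    print(f"Result for query '{q}': {process_query(q, index)}")
# Expected: [1, 2], then [0, 2], then [2]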
b) Implement the vector space model with TF-IDF weighting and
cosine similarity.
CODE: import math
from collections import Counter
# Compute Term Frequency (TF)
def compute_tf(doc):
    tf = Counter(doc)  # Count occurrences of each word
    total_terms = len(doc)  # Total number of words in the document
    return {word: count / total_terms for word, count in tf.items()}
# Compute Inverse Document Frequency (IDF)
def compute_idf(docs):
    total_docs = len(docs)
    idf = {}
    all_words = set(word for doc in docs for word in doc)  # Unique words across all documents
    for word in all_words:
        doc_freq = sum(1 for doc in docs if word in doc)  # Count docs containing the word
        idf[word] = math.log(total_docs / (doc_freq + 1))  # Avoid division by zero
    return idf
# Compute TF-IDF for each document
def compute_tfidf(docs, idf):
    tfidf_docs = []
    for doc in docs:
        tf = compute_tf(doc)  # Get TF of the current document
        tfidf = {word: tf.get(word, 0) * idf.get(word, 0) for word in doc}  # TF-IDF calculation
        tfidf_docs.append(tfidf)
    return tfidf_docs
# Compute Cosine Similarity between two vectors
def cosine_similarity(vec1, vec2):
    words = set(vec1) | set(vec2)  # Combine words from both vectors
    dot_product = sum(vec1.get(word, 0) * vec2.get(word, 0) for word in words)  # Dot product
    magnitude1 = math.sqrt(sum(val**2 for val in vec1.values()))  # Magnitude of vec1
    magnitude2 = math.sqrt(sum(val**2 for val in vec2.values()))  # Magnitude of vec2
    if magnitude1 and magnitude2:
        return dot_product / (magnitude1 * magnitude2)  # Cosine similarity formula
    return 0
# Sample documents (list of words per document)
docs = [
    ["information", "retrieval", "is", "important"],
    ["document", "indexing", "and", "retrieval"],
    ["search", "engines", "use", "information"]
]
# Compute IDF and TF-IDF
idf = compute_idf(docs)
tfidf_docs = compute_tfidf(docs, idf)
# Debugging print: To check TF-IDF values
print("TF-IDF Vectors for Documents:")
for i, tfidf in enumerate(tfidf_docs):
    print(f"Document {i}: {tfidf}")
# Calculate cosine similarity between Document 0 and Document 1
similarity = cosine_similarity(tfidf_docs[0], tfidf_docs[1])
print(f"Cosine Similarity between Document 0 and Document 1: {similarity:.4f}")
OUTPUT:
TF-IDF Vectors for Documents:
Document 0: {'information': 0.0, 'retrieval': 0.0, 'is': 0.1013662770270411, 'important': 0.1013662770270411}
Document 1: {'document': 0.1013662770270411, 'indexing': 0.1013662770270411, 'and': 0.1013662770270411, 'retrieval': 0.0}
Document 2: {'search': 0.1013662770270411, 'engines': 0.1013662770270411, 'use': 0.1013662770270411, 'information': 0.0}
Cosine Similarity between Document 0 and Document 1: 0.0000
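As a cross-check, the same documents can be vectorised with scikit-learn (assuming it is installed); note that TfidfVectorizer uses a smoothed IDF and L2 normalisation, so the absolute weights will differ from the manual version above:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = [" ".join(doc) for doc in docs]  # join the token lists back into strings
tfidf_matrix = TfidfVectorizer().fit_transform(texts)
print(cosine_similarity(tfidf_matrix))   # pairwise similarity of all three documents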
PRACTICAL 3
Aim: - Spelling Correction in IR Systems
a) Develop a spelling correction module using edit distance
algorithms.
b) Integrate the spelling correction module into an
information retrieval system.
a) Develop a spelling correction module using edit distance
algorithms.
Code: import difflib
def correct_spelling(word, dictionary):
    suggestion = difflib.get_close_matches(word, dictionary, n=1, cutoff=0.7)
    return suggestion[0] if suggestion else word
# Example dictionary
dictionary = ["information", "retrieval", "search", "query", "system", "index"]
# Test the function
misspelled_word = "retrievel"
corrected_word = correct_spelling(misspelled_word, dictionary)
print(f"Did you mean: {corrected_word}?")
OUTPUT: Did you mean: retrieval?
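difflib ranks candidates by a similarity ratio rather than an explicit edit distance. A minimal Levenshtein (edit distance) sketch that could back the same module (edit_distance and correct_with_edit_distance are hypothetical helpers, reusing the dictionary above):
def edit_distance(a, b):
    # dp[i][j] = edits needed to turn a[:i] into b[:j]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[len(a)][len(b)]

def correct_with_edit_distance(word, dictionary):
    return min(dictionary, key=lambda w: edit_distance(word, w))

print(correct_with_edit_distance("retrievel", dictionary))  # expected to pick "retrieval"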
b) Integrate the spelling correction module into an
information retrieval system.
Code: import difflib
# Spelling correction function
def correct_spelling(word, dictionary):
    suggestion = difflib.get_close_matches(word, dictionary, n=1, cutoff=0.7)
    return suggestion[0] if suggestion else word
# Simple IR system with a small document collection
documents = {
    "doc1": "Information retrieval is important.",
    "doc2": "Spelling correction improves search results.",
    "doc3": "Query processing includes spell checking."
}
# Search function with spell correction
def search(query, dictionary, documents):
    words = query.split()
    corrected_query = " ".join(correct_spelling(word, dictionary) for word in words)
    print(f"Corrected Query: {corrected_query}")
    # Retrieve matching documents
    results = [doc for doc, content in documents.items() if corrected_query in content]
    return results if results else ["No relevant documents found."]
# Example dictionary
dictionary = ["information", "retrieval", "spelling", "correction", "query", "processing", "search", "results"]
# Test the system
query = "retrievel system"
results = search(query, dictionary, documents)
print("Search Results:", results)
OUTPUT: Corrected Query: retrieval system
Search Results: ['No relevant documents found.']
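The search above only matches documents that contain the whole corrected query as a substring, which is why nothing is returned. A looser, word-level variant (search_by_terms is a hypothetical helper reusing correct_spelling) keeps any document containing at least one corrected query term:
def search_by_terms(query, dictionary, documents):
    corrected = [correct_spelling(w, dictionary) for w in query.lower().split()]
    print("Corrected Query:", " ".join(corrected))
    results = []
    for doc, content in documents.items():
        words = content.lower().split()
        if any(term in words for term in corrected):  # any corrected term appears
            results.append(doc)
    return results or ["No relevant documents found."]

print("Search Results:", search_by_terms("retrievel system", dictionary, documents))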
PRACTICAL 4
Aim: - Evaluation Metrics for IR Systems
a) Calculate precision, recall, and F-measure for a given set
of retrieval results.
b) Use an evaluation toolkit to measure average precision
and other evaluation metrics.
a) Calculate precision, recall, and F-measure for a given set
of retrieval results.
Code: def evaluate_ir(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    true_positives = len(retrieved_set & relevant_set)  # Relevant and retrieved
    precision = true_positives / len(retrieved) if retrieved else 0
    recall = true_positives / len(relevant) if relevant else 0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0
    return precision, recall, f_measure
# Example data
retrieved_docs = ["doc1", "doc2", "doc3", "doc4"]
relevant_docs = ["doc2", "doc3", "doc5"]
# Calculate metrics
precision, recall, f_measure = evaluate_ir(retrieved_docs, relevant_docs)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F-Measure: {f_measure:.2f}")
OUTPUT: Precision: 0.50
Recall: 0.67
F-Measure: 0.57
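Precision is often also reported at a fixed cut-off k, since users mostly inspect the top of a ranked list. A minimal sketch (precision_at_k is a hypothetical helper) over the same example data:
def precision_at_k(retrieved, relevant, k):
    top_k = retrieved[:k]  # only the first k retrieved documents count
    return sum(1 for doc in top_k if doc in relevant) / k

for k in (1, 2, 4):
    print(f"P@{k}: {precision_at_k(retrieved_docs, relevant_docs, k):.2f}")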
b) Use an evaluation toolkit to measure average precision
and other evaluation metrics.
Code: from sklearn.metrics import average_precision_score
# Example relevance scores (1 = relevant, 0 = not relevant)
y_true = [1, 0, 1, 1, 0, 1] # Ground truth relevance
y_scores = [0.9, 0.2, 0.8, 0.7, 0.3, 0.6] # Retrieval system scores
# Calculate Average Precision
ap = average_precision_score(y_true, y_scores)
print(f"Average Precision (AP): {ap:.2f}")
OUTPUT: Average Precision (AP): 1.00
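Averaging AP over several queries gives Mean Average Precision (MAP). A short sketch with the same toolkit; the second query's relevance labels and scores are made up purely for illustration:
from sklearn.metrics import average_precision_score

queries = [
    ([1, 0, 1, 1, 0, 1], [0.9, 0.2, 0.8, 0.7, 0.3, 0.6]),
    ([0, 1, 1, 0, 0, 1], [0.4, 0.9, 0.5, 0.2, 0.1, 0.7]),
]
ap_values = [average_precision_score(y_true, y_scores) for y_true, y_scores in queries]
print(f"Mean Average Precision (MAP): {sum(ap_values) / len(ap_values):.2f}")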
PRACTICAL 5
Aim: - Text Categorization
a) Implement a text classification algorithm (e.g., Naive
Bayes or Support Vector Machines).
b) Train the classifier on a labelled dataset and evaluate its
performance
a) Implement a text classification algorithm (e.g., Naive
Bayes or Support Vector Machines).
Code: # Sample dataset: text and labels
data = [
    ("I love this movie", "positive"),
    ("This film is terrible", "negative"),
    ("Amazing story", "positive"),
    ("Worst movie ever", "negative")
]
# Training: Count word occurrences per class
word_counts = {"positive": {}, "negative": {}}
label_counts = {"positive": 0, "negative": 0}
for text, label in data:
    words = text.lower().split()
    label_counts[label] += 1
    for word in words:
        word_counts[label][word] = word_counts[label].get(word, 0) + 1
# Simple classification function
def classify(text):
    words = text.lower().split()
    scores = {
        "positive": label_counts["positive"],
        "negative": label_counts["negative"]
    }
    for label in scores:
        for word in words:
            scores[label] += word_counts[label].get(word, 0)
    return max(scores, key=scores.get)
# Test the classifier
test_text = "Amazing movie"
predicted_label = classify(test_text)
print(f"Predicted Label: {predicted_label}")
OUTPUT: Predicted Label: positive
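The scoring above simply adds raw counts. A closer match to multinomial Naive Bayes uses log class priors and Laplace-smoothed word likelihoods; a minimal sketch (classify_nb is a hypothetical helper reusing word_counts and label_counts from the code above):
import math

def classify_nb(text):
    vocab = set()
    for counts in word_counts.values():
        vocab.update(counts)
    total_docs = sum(label_counts.values())
    scores = {}
    for label in label_counts:
        score = math.log(label_counts[label] / total_docs)  # log prior P(label)
        total_words = sum(word_counts[label].values())
        for word in text.lower().split():
            count = word_counts[label].get(word, 0)
            # Laplace-smoothed likelihood P(word | label)
            score += math.log((count + 1) / (total_words + len(vocab)))
        scores[label] = score
    return max(scores, key=scores.get)

print(classify_nb("Amazing movie"))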
b) Train the classifier on a labelled dataset and evaluate its
performance.
Code: # Define positive and negative words manually
positive_words = {"good", "great", "excellent", "amazing", "fantastic", "delicious", "love", "best"}
negative_words = {"bad", "terrible", "worst", "awful", "hate", "horrible", "disgusting", "never"}
# Sample dataset: (text, label)
data = [
    ("The food was delicious", "positive"),
    ("I hated the taste", "negative"),
    ("Service was excellent", "positive"),
    ("It was a terrible experience", "negative"),
    ("The meal was fantastic", "positive"),
    ("I will never come back", "negative")
]
# Split into training and testing sets
train_data = data[:5] # First 5 for training
test_data = data[5:] # Last 1 for testing
# Simple keyword-based classification function
def classify(text):
    words = set(text.lower().split())  # Convert text into a set of words
    pos_count = len(words & positive_words)  # Count matching positive words
    neg_count = len(words & negative_words)  # Count matching negative words
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        return "neutral"  # If no match, classify as neutral
# Evaluate accuracy
correct = 0
for text, actual_label in test_data:
    predicted_label = classify(text)
    print(f"Text: '{text}' → Predicted: {predicted_label}, Actual: {actual_label}")
    if predicted_label == actual_label:
        correct += 1
accuracy = correct / len(test_data) * 100
print(f"Accuracy: {accuracy:.2f}%")
OUTPUT: Text: 'I will never come back' → Predicted: negative, Actual: negative
Accuracy: 100.00%
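The same train/test split can also be run through an off-the-shelf classifier. A sketch assuming scikit-learn is available (CountVectorizer builds bag-of-words features, MultinomialNB is the model), reusing the data list above:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = [text for text, _ in data]
labels = [label for _, label in data]

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts[:5])   # first five reviews for training
model = MultinomialNB().fit(X_train, labels[:5])

X_test = vectorizer.transform(texts[5:])        # held-out review for evaluation
print("Predicted:", list(model.predict(X_test)), "Actual:", labels[5:])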
PRACTICAL 6
Aim: - Clustering for Information Retrieval
a) Implement a clustering algorithm (e.g., K-means or
hierarchical clustering).
b) Apply the clustering algorithm to a set of documents
and evaluate the clustering results.
a) Implement a clustering algorithm (e.g., K-means or
hierarchical clustering).
Code: import random
# Sample data points (1D for simplicity)
data = [2, 4, 6, 8, 20, 22, 24, 26]
# Number of clusters
k = 2
# Initialize cluster centers randomly
centers = random.sample(data, k)
# Assign points to nearest cluster
def assign_clusters(data, centers):
    clusters = {c: [] for c in centers}
    for point in data:
        closest_center = min(centers, key=lambda c: abs(c - point))
        clusters[closest_center].append(point)
    return clusters
# Update cluster centers
def update_centers(clusters):
    return [sum(points) // len(points) for points in clusters.values()]
# Run K-means for a few iterations
for _ in range(5):
    clusters = assign_clusters(data, centers)
    centers = update_centers(clusters)
# Print final clusters
print("Final Clusters:", clusters)
OUTPUT: Final Clusters: {5: [2, 4, 6, 8], 23: [20, 22, 24, 26]}
b) Apply the clustering algorithm to a set of documents and
evaluate the clustering results.
Code: # Sample documents
documents = [
    "Apple and banana are fruits",
    "Mango and orange are sweet fruits",
    "Cars and bikes are vehicles",
    "Trucks and buses are heavy vehicles"
]
# Define keywords for clustering
cluster_keywords = {
    "fruits": ["apple", "banana", "mango", "orange", "fruits", "sweet"],
    "vehicles": ["cars", "bikes", "trucks", "buses", "vehicles", "heavy"]
}
# Assign each document to the closest cluster
clusters = {"fruits": [], "vehicles": []}
for doc in documents:
    words = doc.lower().split()
    fruit_score = sum(word in cluster_keywords["fruits"] for word in words)
    vehicle_score = sum(word in cluster_keywords["vehicles"] for word in words)
    if fruit_score > vehicle_score:
        clusters["fruits"].append(doc)
    else:
        clusters["vehicles"].append(doc)
# Print clustered documents
print("Clustered Documents:", clusters)
OUTPUT: Clustered Documents: {
'fruits': [
'Apple and banana are fruits',
'Mango and orange are sweet fruits'
],
'vehicles': [
'Cars and bikes are vehicles',
'Trucks and buses are heavy vehicles'
]
}
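The aim also asks for an evaluation of the clustering. A common sanity check is purity: within each cluster, count the documents carrying the dominant true label and divide by the total. A minimal sketch, assuming hand-assigned ground-truth labels for the four sample documents:
true_labels = {
    "Apple and banana are fruits": "fruits",
    "Mango and orange are sweet fruits": "fruits",
    "Cars and bikes are vehicles": "vehicles",
    "Trucks and buses are heavy vehicles": "vehicles"
}

def purity(clusters, true_labels):
    correct, total = 0, 0
    for cluster_docs in clusters.values():
        if not cluster_docs:
            continue
        labels = [true_labels[doc] for doc in cluster_docs]
        correct += max(labels.count(label) for label in set(labels))  # dominant label count
        total += len(cluster_docs)
    return correct / total if total else 0

print(f"Purity: {purity(clusters, true_labels):.2f}")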
PRACTICAL 7
Aim: - Web Crawling and Indexing
a) Develop a web crawler to fetch and index web
pages.
b) Handle challenges such as robots.txt, dynamic
content, and crawling delays.
a) Develop a web crawler to fetch and index web
pages.
Code: import requests
from bs4 import BeautifulSoup
# Function to crawl and index a webpage
def crawl(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract and print the page title and links
    title = soup.title.string if soup.title else "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True)]
    print(f"Title: {title}")
    print("Links Found:", links)
# Start crawling
crawl("https://example.com") # Replace with any website
OUTPUT: Title: Example Domain
Links Found: ['https://www.iana.org/domains/example']
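The function above fetches and indexes a single page. A small frontier-based sketch (breadth-first, capped at a few pages; the start URL and page limit are placeholders) shows how the same idea extends to an actual crawl:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_site(start_url, max_pages=5):
    visited = set()
    frontier = [start_url]
    page_index = {}  # url -> page title
    while frontier and len(visited) < max_pages:
        url = frontier.pop(0)
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip pages that fail to download
        soup = BeautifulSoup(response.text, "html.parser")
        page_index[url] = soup.title.string if soup.title else "No Title"
        for a in soup.find_all('a', href=True):
            frontier.append(urljoin(url, a['href']))  # resolve relative links
    return page_index

print(crawl_site("https://example.com"))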
b) Handle challenges such as robots.txt, dynamic
content, and crawling delays.
Code: import requests
import time
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
# Function to check robots.txt
def can_crawl(url):
    rp = RobotFileParser()
    rp.set_url(url + "/robots.txt")
    rp.read()
    return rp.can_fetch("*", url)
# Function to fetch and parse web page
def crawl(url):
    if not can_crawl(url):
        print("Crawling not allowed by robots.txt")
        return
    print(f"Crawling: {url}")
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code != 200:
        print("Failed to fetch page")
        return
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract and print page title and links
    title = soup.title.string if soup.title else "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True)]
    print(f"Title: {title}")
    print("Links Found:", links)
    time.sleep(2)  # Respect crawling delays
# Start crawling
crawl("https://example.com") # Replace with any real URL
OUTPUT: Crawling: https://example.com
Title: Example Domain
Links Found: ['https://www.iana.org/domains/example']
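robots.txt can also declare a Crawl-delay; RobotFileParser exposes it, so the fixed two-second sleep above could defer to the site's own value when one is present. A minimal sketch (polite_delay is a hypothetical helper):
from urllib.robotparser import RobotFileParser
import time

def polite_delay(base_url, default_delay=2):
    rp = RobotFileParser()
    rp.set_url(base_url + "/robots.txt")
    rp.read()
    delay = rp.crawl_delay("*")  # None when the site sets no Crawl-delay
    return delay if delay is not None else default_delay

time.sleep(polite_delay("https://example.com"))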
PRACTICAL 8
Aim: - Link Analysis and PageRank
a) Implement the PageRank algorithm to rank web pages
based on link analysis.
b) Apply the PageRank algorithm to a small web graph and
analyze the results.
a) Implement the PageRank algorithm to rank web pages
based on link analysis.
Code: # Simple PageRank Implementation
web_graph = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C']
}
# Initialize ranks
ranks = {page: 1.0 for page in web_graph}
damping_factor = 0.85
iterations = 5
# PageRank Calculation
for _ in range(iterations):
    new_ranks = {}
    for page in web_graph:
        new_rank = (1 - damping_factor) + damping_factor * sum(
            ranks[linking_page] / len(web_graph[linking_page])
            for linking_page in web_graph if page in web_graph[linking_page]
        )
        new_ranks[page] = new_rank
    ranks = new_ranks  # Update ranks
# Print final PageRank values
print("Final PageRanks:", ranks)
OUTPUT: Final PageRanks: {'A': 1.5540671874999998, 'B': 0.8104785546875, 'C': 1.4854542578125, 'D': 0.15000000000000002}
b) Apply the PageRank algorithm to a small web graph and
analyze the results.
Code: # Small web graph representation
web_graph = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C']
}
# Initialize PageRank values
ranks = {page: 1.0 for page in web_graph}
damping_factor = 0.85
iterations = 5
# Calculate PageRank
for _ in range(iterations):
    new_ranks = {}
    for page in web_graph:
        new_rank = (1 - damping_factor) + damping_factor * sum(
            ranks[linking_page] / len(web_graph[linking_page])
            for linking_page in web_graph if page in web_graph[linking_page]
        )
        new_ranks[page] = new_rank
    ranks = new_ranks  # Update ranks
# Analyze results by ranking pages
sorted_ranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
# Print final rankings
print("PageRank Scores:")
for page, score in sorted_ranks:
    print(f"Page {page}: {score:.4f}")
OUTPUT: PageRank Scores:
Page A: 1.5541
Page C: 1.4855
Page B: 0.8105
Page D: 0.1500
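A common variant spreads the teleport term (1 - d) over all pages and iterates until the ranks stop changing instead of for a fixed count, so the scores form a probability distribution over pages. A sketch over the same web_graph (pagerank is a hypothetical helper):
def pagerank(graph, damping=0.85, tol=1e-6, max_iter=100):
    n = len(graph)
    ranks = {page: 1 / n for page in graph}
    for _ in range(max_iter):
        new_ranks = {}
        for page in graph:
            incoming = sum(ranks[p] / len(graph[p]) for p in graph if page in graph[p])
            new_ranks[page] = (1 - damping) / n + damping * incoming
        if all(abs(new_ranks[p] - ranks[p]) < tol for p in graph):  # converged
            return new_ranks
        ranks = new_ranks
    return ranks

print(pagerank(web_graph))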
PRACTICAL 9
Aim: - Learning to Rank
a) Implement a learning to rank algorithm (e.g., RankSVM or
RankBoost).
b) Train the ranking model using labelled data and evaluate
its effectiveness.
a) Implement a learning to rank algorithm (e.g., RankSVM or
RankBoost).
Code: # Sample training data (features) and relevance scores
X = [[0.2, 0.8], [0.5, 0.4], [0.9, 0.6], [0.4, 0.7]]
y = [2, 1, 3, 2] # Higher values mean higher relevance
# Function to compute a simple ranking score
def rank_score(x):
    return sum(x)  # Simple sum of features as a ranking score
# Compute ranking scores for each sample
scores = [rank_score(x) for x in X]
# Sort by ranking score in descending order
ranked_results = sorted(zip(X, scores, y), key=lambda x: x[1], reverse=True)
# Print ranking results
print("Ranked Results (Feature, Score, Relevance):")
for item in ranked_results:
    print(item)
OUTPUT: Ranked Results (Feature, Score, Relevance):
([0.9, 0.6], 1.5, 3)
([0.4, 0.7], 1.1, 2)
([0.2, 0.8], 1.0, 2)
([0.5, 0.4], 0.9, 1)
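The score above is a fixed, hand-written function rather than a learned one. Pairwise methods such as RankSVM learn the weights from preference pairs instead; a minimal perceptron-style sketch on the same X and y (train_pairwise is a hypothetical helper; each update nudges the weights so that higher-relevance items outscore lower-relevance ones):
def train_pairwise(X, y, lr=0.1, epochs=20):
    w = [0.0] * len(X[0])  # one weight per feature
    for _ in range(epochs):
        for i in range(len(X)):
            for j in range(len(X)):
                if y[i] > y[j]:  # item i should be ranked above item j
                    score_i = sum(wk * xk for wk, xk in zip(w, X[i]))
                    score_j = sum(wk * xk for wk, xk in zip(w, X[j]))
                    if score_i <= score_j:  # misordered pair: adjust weights
                        w = [wk + lr * (xi - xj) for wk, xi, xj in zip(w, X[i], X[j])]
    return w

weights = train_pairwise(X, y)
learned = sorted(zip(X, (sum(wk * xk for wk, xk in zip(weights, x)) for x in X), y),
                 key=lambda t: t[1], reverse=True)
print("Learned weights:", weights)
print("Ranking by learned score:", learned)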
b) Train the ranking model using labelled data and evaluate
its effectiveness.
Code: # Sample training data (features) and relevance scores
documents = [
("Doc1", [3, 5]), # Features: word frequency, keyword match
("Doc2", [1, 8]),
("Doc3", [4, 2]),
("Doc4", [2, 6])
]
relevance = {"Doc1": 3, "Doc2": 2, "Doc3": 1, "Doc4": 2} # Higher value = more relevant
# Function to compute a ranking score (weighted sum of features)
def rank_score(features):
    return 0.7 * features[0] + 0.3 * features[1]  # Weighted formula
# Compute ranking scores for each document
scores = [(doc, rank_score(features), relevance[doc]) for doc, features in documents]
# Sort documents based on computed ranking score
ranked_results = sorted(scores, key=lambda x: x[1], reverse=True)
# Evaluate effectiveness (check if ranking order matches relevance scores)
correct = sum(1 for i in range(len(ranked_results) - 1) if ranked_results[i][2] >= ranked_results[i + 1][2])
accuracy = (correct / (len(ranked_results) - 1)) * 100
# Print ranked results and accuracy
print("Ranked Results (Document, Score, Relevance):")
for item in ranked_results:
    print(item)
print(f"Ranking Accuracy: {accuracy:.2f}%")
OUTPUT: Ranked Results (Document, Score, Relevance):
('Doc1', 3.5999999999999996, 3)
('Doc3', 3.4, 1)
('Doc4', 3.1999999999999997, 2)
('Doc2', 3.0999999999999996, 2)
Ranking Accuracy: 66.67%
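Ranking quality can also be summarised with NDCG, which rewards placing highly relevant documents near the top of the list. A short sketch over the ranked_results computed above (dcg is a hypothetical helper):
import math

def dcg(relevances):
    # Discounted cumulative gain: positions further down contribute less
    return sum(rel / math.log2(pos + 2) for pos, rel in enumerate(relevances))

predicted_order = [rel for _, _, rel in ranked_results]
ideal_order = sorted(predicted_order, reverse=True)
print(f"NDCG: {dcg(predicted_order) / dcg(ideal_order):.2f}")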
PRACTICAL 10
Aim: - Advanced Topics in Information Retrieval
a) Implement a text summarization algorithm (e.g.,
extractive or abstractive).
b) Build a question-answering system using techniques such
as information extraction.
a) Implement a text summarization algorithm (e.g.,
extractive or abstractive).
Code: # Sample text
text = """Text summarization is the process of condensing a document while preserving key
information.
Extractive summarization selects important sentences directly from the text, while abstractive
summarization generates new sentences.
Summarization helps improve readability and information retrieval efficiency."""
# Split text into sentences
sentences = text.replace("\n", " ").split(". ")  # Normalize newlines so the text splits into sentences
# Compute word frequency
word_counts = {}
for sentence in sentences:
    for word in sentence.split():
        word_counts[word] = word_counts.get(word, 0) + 1
# Score sentences based on word frequency
sentence_scores = {
    sentence: sum(word_counts.get(word, 0) for word in sentence.split())
    for sentence in sentences
}
# Select the highest-scoring sentence as summary
summary = max(sentence_scores, key=sentence_scores.get)
print("Summary:", summary)
OUTPUT: Summary: Extractive summarization selects important sentences directly from the text, while abstractive summarization generates new sentences
b) Build a question-answering system using techniques such
as information extraction.
Code: # Sample knowledge base (text and answers)
data = {
    "What is text summarization?": "Text summarization is the process of condensing a document while preserving key information.",
    "What is extractive summarization?": "Extractive summarization selects important sentences directly from the text.",
    "What is abstractive summarization?": "Abstractive summarization generates new sentences to convey the main idea."
}
# Function to find the best answer
def answer_question(question):
    return data.get(question, "Sorry, I don't know the answer.")
# User input
question = input("Ask a question: ")
print("Answer:", answer_question(question))
OUTPUT: Ask a question: What is extractive summarization?
Answer: Extractive summarization selects important sentences directly from the text.
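Exact-match lookup fails as soon as the wording changes. A small sketch (answer_question_fuzzy is a hypothetical helper over the same data dictionary) that instead returns the entry whose question shares the most words with the user's question, a very light form of information extraction:
def answer_question_fuzzy(question, data):
    question_words = set(question.lower().split())
    best_match, best_overlap = None, 0
    for known_question, answer in data.items():
        overlap = len(question_words & set(known_question.lower().split()))
        if overlap > best_overlap:
            best_match, best_overlap = known_question, overlap
    return data[best_match] if best_match else "Sorry, I don't know the answer."

print(answer_question_fuzzy("Tell me about extractive summarization", data))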
THE END