```python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import seaborn as sns
```
LSA starts by representing documents as vectors in a term space. Given a corpus of \(n\) documents and a vocabulary of \(p\) terms, we construct a document-term matrix \(X \in \mathbb{R}^{n\times p}\):
\[X = [x_{i,j}]\]
where \(x_{i,j}\) represents the importance of term \(j\) in document \(i\).
```python
# Example corpus
corpus = [
    "machine learning algorithms",
    "deep learning neural networks",
    "statistical learning theory",
    "neural networks architecture",
    "statistical inference methods",
    "statistical descriptive methods",
]

# Create the document-term matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()

# Display as a dataframe
df = pd.DataFrame(X.toarray(), columns=terms)
print("Document-Term Matrix:")
df
```
Document-Term Matrix:
|   | algorithms | architecture | deep | descriptive | inference | learning | machine | methods | networks | neural | statistical | theory |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.635091 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.439681 | 0.635091 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 0.000000 | 0.000000 | 0.595054 | 0.000000 | 0.000000 | 0.411964 | 0.000000 | 0.000000 | 0.487953 | 0.487953 | 0.000000 | 0.000000 |
| 2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.494686 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.494686 | 0.714542 |
| 3 | 0.000000 | 0.653044 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.535506 | 0.535506 | 0.000000 | 0.000000 |
| 4 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.681722 | 0.000000 | 0.000000 | 0.559022 | 0.000000 | 0.000000 | 0.471964 | 0.000000 |
| 5 | 0.000000 | 0.000000 | 0.000000 | 0.681722 | 0.000000 | 0.000000 | 0.000000 | 0.559022 | 0.000000 | 0.000000 | 0.471964 | 0.000000 |
The term frequency-inverse document frequency (TF-IDF) weighting scheme is commonly used:
\[\text{tfidf}_{ij} = \text{tf}_{ij} \times \log(\frac{n}{df_j})\]
where:

- \(\text{tf}_{ij}\) is the frequency of term \(j\) in document \(i\),
- \(\text{df}_j\) is the number of documents that contain term \(j\), and
- \(n\) is the total number of documents.
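As a rough numeric sketch of this formula on made-up counts (note that scikit-learn's `TfidfVectorizer` uses a smoothed IDF and L2-normalizes each row, so its values differ from this raw definition):

```python
def tfidf_textbook(counts):
    """Plain tf-idf: tf * log(n / df), with no smoothing or normalization.

    counts: (n_docs, n_terms) array of raw term counts.
    """
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)   # document frequency of each term
    idf = np.log(n_docs / df)       # log(n / df_j)
    return counts * idf             # tf_ij * idf_j

# Toy counts: 3 documents, 2 terms (illustrative only)
toy_counts = np.array([[2, 0],
                       [1, 1],
                       [0, 3]])
print(tfidf_textbook(toy_counts))
```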
LSA applies SVD to the document-term matrix:
\[X = U\Sigma V^T\]
where:

- \(U \in \mathbb{R}^{n \times r}\) contains the document-concept vectors (left singular vectors),
- \(\Sigma \in \mathbb{R}^{r \times r}\) is a diagonal matrix of singular values \(\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_r \geq 0\), and
- \(V \in \mathbb{R}^{p \times r}\) contains the term-concept vectors (right singular vectors),

with \(r = \min(n, p)\) for the thin SVD computed below.
```python
# Compute the thin SVD of the document-term matrix
U, s, Vt = np.linalg.svd(X.toarray(), full_matrices=False)

# Plot the singular values
plt.figure(figsize=(10, 4))
plt.plot(s, 'bo-')
plt.title('Singular Values in LSA')
plt.xlabel('Index')
plt.ylabel('Singular Value')
plt.grid(True)
plt.show()
```
LSA typically uses a truncated SVD to reduce to \(k\) dimensions:
\[X_k = U_k\Sigma_k V_k^T\]
This reduction:

- keeps only the \(k\) largest singular values and the corresponding singular vectors,
- yields the best rank-\(k\) approximation of \(X\) in the Frobenius norm (Eckart-Young), as verified in the sketch below, and
- merges terms that co-occur in similar contexts, which is how LSA captures synonymy and filters noise.
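A minimal sketch of the truncation, reusing `X`, `U`, `s`, and `Vt` from the SVD above; the reconstruction error equals the square root of the sum of the discarded squared singular values:

```python
# Truncate to k dimensions and measure the reconstruction error
k = 2
X_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

frob_error = np.linalg.norm(X.toarray() - X_k, 'fro')
print(f"Reconstruction error:           {frob_error:.4f}")
print(f"sqrt(sum of discarded sigma^2): {np.sqrt((s[k:]**2).sum()):.4f}")
```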
```python
def plot_semantic_space(Vt, terms, k=2):
    # Project terms into the k-dimensional semantic space
    term_coords = Vt[:k, :].T

    plt.figure(figsize=(12, 6))

    # Plot terms
    plt.scatter(term_coords[:, 0], term_coords[:, 1], c='blue', alpha=0.5)
    for i, term in enumerate(terms):
        plt.annotate(term, (term_coords[i, 0], term_coords[i, 1]))

    plt.title(f'{k}D Semantic Space')
    plt.xlabel('First Dimension')
    plt.ylabel('Second Dimension')
    plt.grid(True)
    plt.show()


# Plot the 2D semantic space
plot_semantic_space(Vt, terms)
```
We can compute document similarity in the reduced space:
```python
def compute_doc_similarity(X_reduced):
    # Normalize document vectors to unit length
    X_norm = normalize(X_reduced)
    # Cosine similarity is the dot product of unit vectors
    similarity = X_norm @ X_norm.T
    return similarity


# Reduce to k dimensions
k = 2
X_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

# Compute and visualize the similarities
sim_matrix = compute_doc_similarity(X_k)

plt.figure(figsize=(8, 6))
sns.heatmap(sim_matrix, annot=True, cmap='RdBu', center=0)
plt.title('Document Similarity Matrix')
plt.show()
```
For a query vector \(q \in \mathbb{R}^p\), we project it into the LSA space:
\[q_k = q^T V_k \Sigma_k^{-1}\]
This gives us the query coordinates in the reduced concept space.
The similarity between query and documents is computed as:
\[\text{sim}(q, d_i) = \frac{q_k \cdot d_{ik}}{||q_k|| \cdot ||d_{ik}||}\]
where \(d_{ik}\) is the \(i\)-th row of the document-concept matrix \(D_k = U_k\Sigma_k\).
```python
def process_query(query, vectorizer, Vk, Sk, Dk, k):
    # Convert the query to a TF-IDF vector
    q = vectorizer.transform([query]).toarray()

    # Project the query into the LSA space
    qk = q @ Vk[:, :k] @ np.diag(1 / Sk[:k])

    # Compute cosine similarities against the document vectors
    similarities = np.dot(qk, Dk[:, :k].T)
    similarities = similarities / (np.linalg.norm(qk) *
                                   np.linalg.norm(Dk[:, :k], axis=1))

    return similarities.flatten()


# Example query processing
k = 2  # number of dimensions to keep
Uk = U[:, :k]
Sk = s[:k]
Vk = Vt.T[:, :k]
Dk = Uk * Sk  # document-concept matrix

query = "statistical"
similarities = process_query(query, vectorizer, Vt.T, s, Uk, k)

# Print results
for doc, sim in zip(corpus, similarities):
    print(f"Similarity: {sim:.3f} - {doc}")
```
Similarity: 0.346 - machine learning algorithms
Similarity: 0.019 - deep learning neural networks
Similarity: 0.890 - statistical learning theory
Similarity: -0.108 - neural networks architecture
Similarity: 0.994 - statistical inference methods
Similarity: 0.994 - statistical descriptive methods
The LSA similarity measure has several important properties:

- Term-term similarity can be read off the rows of \(V_k\Sigma_k\), the term coordinates in concept space (a short sketch follows this list):

\[\text{sim}(t_i, t_j) = \frac{(V_k\Sigma_k)_i \cdot (V_k\Sigma_k)_j}{\|(V_k\Sigma_k)_i\| \cdot \|(V_k\Sigma_k)_j\|}\]

- The contribution of concept \(k\) to term \(t_i\), aggregated over the documents, summarizes the term's context in the corpus:

\[\text{context}(t_i) = \sum_{j \in \text{docs}} u_{jk}\sigma_k v_{ik}\]
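A sketch of the term-term similarity above, computing cosine similarities between rows of \(V_k\Sigma_k\) and reusing `Vt`, `s`, `terms`, and `normalize` from the earlier cells:

```python
# Term coordinates in concept space: rows of V_k * Sigma_k
k = 2
term_coords = Vt[:k, :].T * s[:k]    # shape (n_terms, k)
term_norm = normalize(term_coords)   # unit-length rows
term_sim = term_norm @ term_norm.T   # cosine similarity matrix

# Terms most similar to "statistical"
i = list(terms).index("statistical")
ranked = np.argsort(term_sim[i])[::-1]
for j in ranked[:4]:
    print(f"{term_sim[i, j]: .3f}  {terms[j]}")
```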
The optimal number of dimensions \(k\) can be chosen based on:

- the fraction of total variance retained, keeping the smallest \(k\) such that

\[\frac{\sum_{i=1}^k \sigma_i^2}{\sum_{i=1}^r \sigma_i^2} \geq \theta\]

for some threshold \(\theta\) (sketched right after this list), or

- the shape of the singular-value spectrum, stopping at an elbow where the relative drop between consecutive singular values becomes small:

\[\frac{\sigma_k - \sigma_{k+1}}{\sigma_k} \leq \epsilon\]
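For example, a minimal sketch of the variance criterion with \(\theta = 0.9\) (the helper name `choose_k` is illustrative, not part of the original code):

```python
def choose_k(s, theta=0.9):
    """Smallest k whose cumulative squared singular values reach theta."""
    cum_var = np.cumsum(s**2) / np.sum(s**2)
    return int(np.searchsorted(cum_var, theta) + 1)

print("k for 90% variance:", choose_k(s, theta=0.9))
```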
```python
def plot_explained_variance(s):
    total_var = (s**2).sum()
    cum_var = np.cumsum(s**2) / total_var

    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(s) + 1), cum_var, 'b-o')
    plt.axhline(y=0.9, color='r', linestyle='--', label='90% threshold')
    plt.title("Cumulative Explained Variance")
    plt.xlabel("Number of components")
    plt.ylabel("Fraction of variance explained")
    plt.grid(True)
    plt.legend()
    plt.show()


plot_explained_variance(s)
```
LSA can reveal term relationships through their positions in the semantic space:
```python
from sklearn.cluster import KMeans


def plot_term_clusters(Vt, terms, k=2):
    # Use the first k components as term coordinates
    term_coords = Vt[:k, :].T

    # Cluster terms in the semantic space
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(term_coords)
    clusters = kmeans.predict(term_coords)

    plt.figure(figsize=(12, 6))
    scatter = plt.scatter(term_coords[:, 0], term_coords[:, 1],
                          c=clusters, cmap='viridis')

    for i, term in enumerate(terms):
        plt.annotate(term, (term_coords[i, 0], term_coords[i, 1]))

    plt.title('Term Clusters in Semantic Space')
    plt.colorbar(scatter)
    plt.show()


# plot_term_clusters(Vt, terms)
```
LSA can be extended to multiple languages through parallel corpora: the decomposition is applied to a document-term matrix in which each row combines a document with its translation, so terms from different languages are mapped into a shared concept space, as sketched below.
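A sketch of this construction on a made-up two-document English/Spanish parallel corpus (the variable names and the toy data are ours): each training row concatenates a document with its translation, and the usual TF-IDF + SVD pipeline is then applied unchanged.

```python
# Hypothetical English/Spanish parallel corpus (toy example)
parallel_en = ["machine learning algorithms", "statistical inference methods"]
parallel_es = ["algoritmos de aprendizaje automático", "métodos de inferencia estadística"]

# Each training row contains a document and its translation
bilingual_docs = [f"{en} {es}" for en, es in zip(parallel_en, parallel_es)]

bi_vectorizer = TfidfVectorizer()
X_bi = bi_vectorizer.fit_transform(bilingual_docs)
U_bi, s_bi, Vt_bi = np.linalg.svd(X_bi.toarray(), full_matrices=False)

# Terms from both languages now live in the same concept space
print(bi_vectorizer.get_feature_names_out())
print(Vt_bi[:2, :].T.round(3))   # first two concept coordinates per term
```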
The choice of dimensionality \(k\) depends on:

1. Corpus size and sparsity
2. Computational resources
3. Application requirements
```python
def plot_explained_variance(s):
    var_explained = np.cumsum(s**2) / np.sum(s**2)

    plt.figure(figsize=(10, 4))
    plt.plot(var_explained, 'bo-')
    plt.axhline(y=0.9, color='r', linestyle='--')
    plt.title('Cumulative Explained Variance Ratio')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid(True)
    plt.show()


plot_explained_variance(s)
```
Text preprocessing significantly affects LSA results; common steps (several of which are sketched below) include:

- Stopword removal
- Stemming/lemmatization
- Case normalization
- N-gram inclusion
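A sketch of how these options can be set on `TfidfVectorizer` (stemming/lemmatization is not built into scikit-learn and would require a custom `preprocessor` or `tokenizer`, e.g. from NLTK or spaCy):

```python
preprocessed_vectorizer = TfidfVectorizer(
    lowercase=True,         # case normalization
    stop_words='english',   # stopword removal
    ngram_range=(1, 2),     # include unigrams and bigrams
)
X_pre = preprocessed_vectorizer.fit_transform(corpus)
print(X_pre.shape)
print(preprocessed_vectorizer.get_feature_names_out()[:10])
```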
Modern extensions include (a brief LDA sketch follows):

- Probabilistic LSA (pLSA)
- Latent Dirichlet Allocation (LDA)
- Neural embeddings
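As an illustration of one of these extensions, a minimal sketch fitting LDA on the same toy corpus with scikit-learn; it uses raw counts rather than TF-IDF, since LDA models term counts, and two topics is an arbitrary choice for this tiny corpus.

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(corpus)

lda = LatentDirichletAllocation(n_components=2, random_state=0)
doc_topics = lda.fit_transform(counts)   # (n_docs, n_topics) topic mixture per document

vocab = count_vectorizer.get_feature_names_out()
for t, weights in enumerate(lda.components_):
    top = vocab[np.argsort(weights)[::-1][:4]]
    print(f"Topic {t}: {', '.join(top)}")
```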