KMP Algorithm, Tries, Suffix Trees, Search Engines
The KMP algorithm efficiently finds occurrences of a pattern within a text by utilizing information about the pattern itself to skip characters during the search.
Watch how KMP efficiently searches for patterns:
#include
#include
#include
class KMPAlgorithm {
public:
// Compute failure function (LPS array)
static std::vector computeFailureFunction(const std::string& pattern) {
int m = pattern.length();
std::vector failure(m, 0);
int j = 0;
for (int i = 1; i < m; i++) {
while (j > 0 && pattern[i] != pattern[j]) {
j = failure[j - 1];
}
if (pattern[i] == pattern[j]) {
j++;
}
failure[i] = j;
}
return failure;
}
// KMP search algorithm
static std::vector search(const std::string& text, const std::string& pattern) {
int n = text.length();
int m = pattern.length();
std::vector matches;
if (m == 0) return matches;
// Compute failure function
std::vector failure = computeFailureFunction(pattern);
int j = 0; // pattern index
for (int i = 0; i < n; i++) {
while (j > 0 && text[i] != pattern[j]) {
j = failure[j - 1];
}
if (text[i] == pattern[j]) {
j++;
}
if (j == m) {
matches.push_back(i - m + 1);
j = failure[j - 1];
}
}
return matches;
}
// Display failure function
static void displayFailureFunction(const std::string& pattern) {
std::vector failure = computeFailureFunction(pattern);
std::cout << "Pattern: ";
for (char c : pattern) {
std::cout << c << " ";
}
std::cout << std::endl;
std::cout << "Failure: ";
for (int f : failure) {
std::cout << f << " ";
}
std::cout << std::endl;
}
};
A Trie (prefix tree) is a tree-like data structure used to store and search strings efficiently. Each node represents a character, and paths from root to leaves represent complete strings.
Build a trie and search for words:
#include
#include
#include
#include
class TrieNode {
public:
std::unordered_map children;
bool isEndOfWord;
TrieNode() : isEndOfWord(false) {}
~TrieNode() {
for (auto& pair : children) {
delete pair.second;
}
}
};
class Trie {
private:
TrieNode* root;
public:
Trie() {
root = new TrieNode();
}
~Trie() {
delete root;
}
void insert(const std::string& word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
current->children[c] = new TrieNode();
}
current = current->children[c];
}
current->isEndOfWord = true;
}
bool search(const std::string& word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
return false;
}
current = current->children[c];
}
return current->isEndOfWord;
}
bool startsWith(const std::string& prefix) {
TrieNode* current = root;
for (char c : prefix) {
if (current->children.find(c) == current->children.end()) {
return false;
}
current = current->children[c];
}
return true;
}
std::vector autoComplete(const std::string& prefix) {
std::vector results;
TrieNode* current = root;
// Navigate to prefix node
for (char c : prefix) {
if (current->children.find(c) == current->children.end()) {
return results; // Prefix not found
}
current = current->children[c];
}
// Collect all words with this prefix
collectWords(current, prefix, results);
return results;
}
private:
void collectWords(TrieNode* node, const std::string& currentWord,
std::vector& results) {
if (node->isEndOfWord) {
results.push_back(currentWord);
}
for (auto& pair : node->children) {
collectWords(pair.second, currentWord + pair.first, results);
}
}
public:
void display() {
std::cout << "Trie contents:" << std::endl;
displayHelper(root, "");
}
private:
void displayHelper(TrieNode* node, const std::string& prefix) {
if (node->isEndOfWord) {
std::cout << prefix << std::endl;
}
for (auto& pair : node->children) {
displayHelper(pair.second, prefix + pair.first);
}
}
};
Compressed tries optimize space by merging nodes with single children, storing edge labels instead of individual characters.
#include
#include
#include
class CompressedTrieNode {
public:
std::unordered_map children;
std::string edgeLabel;
bool isEndOfWord;
CompressedTrieNode(const std::string& label = "")
: edgeLabel(label), isEndOfWord(false) {}
~CompressedTrieNode() {
for (auto& pair : children) {
delete pair.second;
}
}
};
class CompressedTrie {
private:
CompressedTrieNode* root;
public:
CompressedTrie() {
root = new CompressedTrieNode();
}
~CompressedTrie() {
delete root;
}
void insert(const std::string& word) {
insertHelper(root, word, 0);
}
private:
void insertHelper(CompressedTrieNode* node, const std::string& word, int index) {
if (index == word.length()) {
node->isEndOfWord = true;
return;
}
char firstChar = word[index];
if (node->children.find(firstChar) == node->children.end()) {
// Create new node with remaining suffix
node->children[firstChar] =
new CompressedTrieNode(word.substr(index));
node->children[firstChar]->isEndOfWord = true;
return;
}
CompressedTrieNode* child = node->children[firstChar];
std::string& edgeLabel = child->edgeLabel;
// Find common prefix
int commonLen = 0;
int minLen = std::min(edgeLabel.length(), word.length() - index);
while (commonLen < minLen &&
edgeLabel[commonLen] == word[index + commonLen]) {
commonLen++;
}
if (commonLen == edgeLabel.length()) {
// Edge label is prefix of remaining word
insertHelper(child, word, index + commonLen);
} else {
// Split the edge
CompressedTrieNode* newNode =
new CompressedTrieNode(edgeLabel.substr(commonLen));
newNode->children = std::move(child->children);
newNode->isEndOfWord = child->isEndOfWord;
child->children.clear();
child->children[edgeLabel[commonLen]] = newNode;
child->edgeLabel = edgeLabel.substr(0, commonLen);
child->isEndOfWord = (index + commonLen == word.length());
if (index + commonLen < word.length()) {
char nextChar = word[index + commonLen];
child->children[nextChar] =
new CompressedTrieNode(word.substr(index + commonLen));
child->children[nextChar]->isEndOfWord = true;
}
}
}
public:
bool search(const std::string& word) {
return searchHelper(root, word, 0);
}
private:
bool searchHelper(CompressedTrieNode* node, const std::string& word, int index) {
if (index == word.length()) {
return node->isEndOfWord;
}
char firstChar = word[index];
if (node->children.find(firstChar) == node->children.end()) {
return false;
}
CompressedTrieNode* child = node->children[firstChar];
std::string& edgeLabel = child->edgeLabel;
if (word.substr(index, edgeLabel.length()) != edgeLabel) {
return false;
}
return searchHelper(child, word, index + edgeLabel.length());
}
};
Suffix trees are compressed tries of all suffixes of a string, enabling efficient substring searches and pattern matching.
#include
#include
#include
#include
#include
class Document {
public:
int id;
std::string title;
std::string content;
Document(int id, const std::string& title, const std::string& content)
: id(id), title(title), content(content) {}
};
class SearchEngine {
private:
std::vector documents;
Trie keywordTrie;
std::unordered_map> invertedIndex;
public:
void addDocument(const Document& doc) {
documents.push_back(doc);
indexDocument(doc);
}
private:
void indexDocument(const Document& doc) {
// Tokenize and index words
std::vector words = tokenize(doc.content);
for (const std::string& word : words) {
keywordTrie.insert(word);
invertedIndex[word].push_back(doc.id);
}
}
std::vector tokenize(const std::string& text) {
std::vector tokens;
std::string current = "";
for (char c : text) {
if (std::isalnum(c)) {
current += std::tolower(c);
} else if (!current.empty()) {
tokens.push_back(current);
current = "";
}
}
if (!current.empty()) {
tokens.push_back(current);
}
return tokens;
}
public:
std::vector search(const std::string& query) {
std::vector queryWords = tokenize(query);
std::vector results;
if (queryWords.empty()) return results;
// Get documents containing first word
if (invertedIndex.find(queryWords[0]) != invertedIndex.end()) {
results = invertedIndex[queryWords[0]];
}
// Intersect with documents containing other words
for (size_t i = 1; i < queryWords.size(); i++) {
if (invertedIndex.find(queryWords[i]) != invertedIndex.end()) {
std::vector temp;
std::set_intersection(
results.begin(), results.end(),
invertedIndex[queryWords[i]].begin(),
invertedIndex[queryWords[i]].end(),
std::back_inserter(temp)
);
results = temp;
} else {
results.clear();
break;
}
}
return results;
}
std::vector autoComplete(const std::string& prefix) {
return keywordTrie.autoComplete(prefix);
}
void displayResults(const std::vector& docIds) {
for (int id : docIds) {
for (const Document& doc : documents) {
if (doc.id == id) {
std::cout << "Document " << id << ": " << doc.title << std::endl;
break;
}
}
}
}
};
Algorithm | Preprocessing | Search Time | Space |
---|---|---|---|
Naive Search | O(1) | O(nm) | O(1) |
KMP | O(m) | O(n) | O(m) |
Trie | O(total chars) | O(m) | O(total chars) |
Suffix Tree | O(n) | O(m + k) | O(n) |
Where n = text length, m = pattern length, k = number of matches
#include
int main() {
std::cout << "=== Unit 3: String Algorithms Demo ===" << std::endl;
// KMP Algorithm Demo
std::cout << "\n--- KMP Algorithm ---" << std::endl;
std::string text = "ABABCABABA";
std::string pattern = "ABABA";
KMPAlgorithm::displayFailureFunction(pattern);
std::vector matches = KMPAlgorithm::search(text, pattern);
std::cout << "Text: " << text << std::endl;
std::cout << "Pattern: " << pattern << std::endl;
std::cout << "Matches found at positions: ";
for (int pos : matches) {
std::cout << pos << " ";
}
std::cout << std::endl;
// Trie Demo
std::cout << "\n--- Trie Demo ---" << std::endl;
Trie trie;
trie.insert("cat");
trie.insert("car");
trie.insert("card");
trie.insert("care");
trie.insert("careful");
trie.display();
std::cout << "Search 'car': " << (trie.search("car") ? "Found" : "Not found") << std::endl;
std::cout << "Search 'card': " << (trie.search("card") ? "Found" : "Not found") << std::endl;
std::cout << "Auto-complete for 'car': ";
std::vector suggestions = trie.autoComplete("car");
for (const std::string& word : suggestions) {
std::cout << word << " ";
}
std::cout << std::endl;
// Search Engine Demo
std::cout << "\n--- Search Engine Demo ---" << std::endl;
SearchEngine engine;
engine.addDocument(Document(1, "Introduction to Algorithms",
"algorithms data structures sorting searching"));
engine.addDocument(Document(2, "Database Systems",
"database sql queries indexing"));
engine.addDocument(Document(3, "Advanced Algorithms",
"algorithms graph theory dynamic programming"));
std::vector results = engine.search("algorithms");
std::cout << "Search results for 'algorithms':" << std::endl;
engine.displayResults(results);
return 0;
}