Unit 3: String Algorithms

KMP Algorithm, Tries, Suffix Trees, Search Engines

Duration: 8 Hours

Learning Objectives

1. KMP (Knuth-Morris-Pratt) Algorithm

The KMP algorithm efficiently finds occurrences of a pattern within a text by utilizing information about the pattern itself to skip characters during the search.

Key Concepts:

  • Failure Function: Preprocessing to avoid redundant comparisons
  • Border: Proper prefix that is also a suffix
  • Time Complexity: O(n + m) where n is text length, m is pattern length

KMP Algorithm Visualization

Watch how KMP efficiently searches for patterns:

Click "Search" to start the KMP algorithm visualization

KMP Implementation

#include 
#include 
#include 

class KMPAlgorithm {
public:
    // Compute failure function (LPS array)
    static std::vector computeFailureFunction(const std::string& pattern) {
        int m = pattern.length();
        std::vector failure(m, 0);
        int j = 0;
        
        for (int i = 1; i < m; i++) {
            while (j > 0 && pattern[i] != pattern[j]) {
                j = failure[j - 1];
            }
            
            if (pattern[i] == pattern[j]) {
                j++;
            }
            
            failure[i] = j;
        }
        
        return failure;
    }
    
    // KMP search algorithm
    static std::vector search(const std::string& text, const std::string& pattern) {
        int n = text.length();
        int m = pattern.length();
        std::vector matches;
        
        if (m == 0) return matches;
        
        // Compute failure function
        std::vector failure = computeFailureFunction(pattern);
        
        int j = 0; // pattern index
        
        for (int i = 0; i < n; i++) {
            while (j > 0 && text[i] != pattern[j]) {
                j = failure[j - 1];
            }
            
            if (text[i] == pattern[j]) {
                j++;
            }
            
            if (j == m) {
                matches.push_back(i - m + 1);
                j = failure[j - 1];
            }
        }
        
        return matches;
    }
    
    // Display failure function
    static void displayFailureFunction(const std::string& pattern) {
        std::vector failure = computeFailureFunction(pattern);
        
        std::cout << "Pattern: ";
        for (char c : pattern) {
            std::cout << c << " ";
        }
        std::cout << std::endl;
        
        std::cout << "Failure: ";
        for (int f : failure) {
            std::cout << f << " ";
        }
        std::cout << std::endl;
    }
};

2. Trie Data Structure

A Trie (prefix tree) is a tree-like data structure used to store and search strings efficiently. Each node represents a character, and paths from root to leaves represent complete strings.

Trie Construction and Search

Build a trie and search for words:

Insert words to build the trie structure

Trie Implementation

#include 
#include 
#include 
#include 

class TrieNode {
public:
    std::unordered_map children;
    bool isEndOfWord;
    
    TrieNode() : isEndOfWord(false) {}
    
    ~TrieNode() {
        for (auto& pair : children) {
            delete pair.second;
        }
    }
};

class Trie {
private:
    TrieNode* root;
    
public:
    Trie() {
        root = new TrieNode();
    }
    
    ~Trie() {
        delete root;
    }
    
    void insert(const std::string& word) {
        TrieNode* current = root;
        
        for (char c : word) {
            if (current->children.find(c) == current->children.end()) {
                current->children[c] = new TrieNode();
            }
            current = current->children[c];
        }
        
        current->isEndOfWord = true;
    }
    
    bool search(const std::string& word) {
        TrieNode* current = root;
        
        for (char c : word) {
            if (current->children.find(c) == current->children.end()) {
                return false;
            }
            current = current->children[c];
        }
        
        return current->isEndOfWord;
    }
    
    bool startsWith(const std::string& prefix) {
        TrieNode* current = root;
        
        for (char c : prefix) {
            if (current->children.find(c) == current->children.end()) {
                return false;
            }
            current = current->children[c];
        }
        
        return true;
    }
    
    std::vector autoComplete(const std::string& prefix) {
        std::vector results;
        TrieNode* current = root;
        
        // Navigate to prefix node
        for (char c : prefix) {
            if (current->children.find(c) == current->children.end()) {
                return results; // Prefix not found
            }
            current = current->children[c];
        }
        
        // Collect all words with this prefix
        collectWords(current, prefix, results);
        return results;
    }
    
private:
    void collectWords(TrieNode* node, const std::string& currentWord, 
                     std::vector& results) {
        if (node->isEndOfWord) {
            results.push_back(currentWord);
        }
        
        for (auto& pair : node->children) {
            collectWords(pair.second, currentWord + pair.first, results);
        }
    }
    
public:
    void display() {
        std::cout << "Trie contents:" << std::endl;
        displayHelper(root, "");
    }
    
private:
    void displayHelper(TrieNode* node, const std::string& prefix) {
        if (node->isEndOfWord) {
            std::cout << prefix << std::endl;
        }
        
        for (auto& pair : node->children) {
            displayHelper(pair.second, prefix + pair.first);
        }
    }
};

3. Compressed Tries (Radix Trees)

Compressed tries optimize space by merging nodes with single children, storing edge labels instead of individual characters.

Advantages of Compressed Tries:

  • Space Efficient: Reduces number of nodes
  • Faster Traversal: Fewer nodes to visit
  • Edge Labels: Store substrings instead of single characters

Compressed Trie Implementation

#include 
#include 
#include 

class CompressedTrieNode {
public:
    std::unordered_map children;
    std::string edgeLabel;
    bool isEndOfWord;
    
    CompressedTrieNode(const std::string& label = "") 
        : edgeLabel(label), isEndOfWord(false) {}
    
    ~CompressedTrieNode() {
        for (auto& pair : children) {
            delete pair.second;
        }
    }
};

class CompressedTrie {
private:
    CompressedTrieNode* root;
    
public:
    CompressedTrie() {
        root = new CompressedTrieNode();
    }
    
    ~CompressedTrie() {
        delete root;
    }
    
    void insert(const std::string& word) {
        insertHelper(root, word, 0);
    }
    
private:
    void insertHelper(CompressedTrieNode* node, const std::string& word, int index) {
        if (index == word.length()) {
            node->isEndOfWord = true;
            return;
        }
        
        char firstChar = word[index];
        
        if (node->children.find(firstChar) == node->children.end()) {
            // Create new node with remaining suffix
            node->children[firstChar] = 
                new CompressedTrieNode(word.substr(index));
            node->children[firstChar]->isEndOfWord = true;
            return;
        }
        
        CompressedTrieNode* child = node->children[firstChar];
        std::string& edgeLabel = child->edgeLabel;
        
        // Find common prefix
        int commonLen = 0;
        int minLen = std::min(edgeLabel.length(), word.length() - index);
        
        while (commonLen < minLen && 
               edgeLabel[commonLen] == word[index + commonLen]) {
            commonLen++;
        }
        
        if (commonLen == edgeLabel.length()) {
            // Edge label is prefix of remaining word
            insertHelper(child, word, index + commonLen);
        } else {
            // Split the edge
            CompressedTrieNode* newNode = 
                new CompressedTrieNode(edgeLabel.substr(commonLen));
            newNode->children = std::move(child->children);
            newNode->isEndOfWord = child->isEndOfWord;
            
            child->children.clear();
            child->children[edgeLabel[commonLen]] = newNode;
            child->edgeLabel = edgeLabel.substr(0, commonLen);
            child->isEndOfWord = (index + commonLen == word.length());
            
            if (index + commonLen < word.length()) {
                char nextChar = word[index + commonLen];
                child->children[nextChar] = 
                    new CompressedTrieNode(word.substr(index + commonLen));
                child->children[nextChar]->isEndOfWord = true;
            }
        }
    }
    
public:
    bool search(const std::string& word) {
        return searchHelper(root, word, 0);
    }
    
private:
    bool searchHelper(CompressedTrieNode* node, const std::string& word, int index) {
        if (index == word.length()) {
            return node->isEndOfWord;
        }
        
        char firstChar = word[index];
        
        if (node->children.find(firstChar) == node->children.end()) {
            return false;
        }
        
        CompressedTrieNode* child = node->children[firstChar];
        std::string& edgeLabel = child->edgeLabel;
        
        if (word.substr(index, edgeLabel.length()) != edgeLabel) {
            return false;
        }
        
        return searchHelper(child, word, index + edgeLabel.length());
    }
};

4. Suffix Trees and Search Engines

Suffix trees are compressed tries of all suffixes of a string, enabling efficient substring searches and pattern matching.

Search Engine Applications

1 Indexing: Build tries/suffix trees for document collections
2 Query Processing: Use string matching for keyword search
3 Auto-completion: Implement prefix-based suggestions
4 Ranking: Score documents based on pattern occurrences

Simple Search Engine Implementation

#include 
#include 
#include 
#include 
#include 

class Document {
public:
    int id;
    std::string title;
    std::string content;
    
    Document(int id, const std::string& title, const std::string& content)
        : id(id), title(title), content(content) {}
};

class SearchEngine {
private:
    std::vector documents;
    Trie keywordTrie;
    std::unordered_map> invertedIndex;
    
public:
    void addDocument(const Document& doc) {
        documents.push_back(doc);
        indexDocument(doc);
    }
    
private:
    void indexDocument(const Document& doc) {
        // Tokenize and index words
        std::vector words = tokenize(doc.content);
        
        for (const std::string& word : words) {
            keywordTrie.insert(word);
            invertedIndex[word].push_back(doc.id);
        }
    }
    
    std::vector tokenize(const std::string& text) {
        std::vector tokens;
        std::string current = "";
        
        for (char c : text) {
            if (std::isalnum(c)) {
                current += std::tolower(c);
            } else if (!current.empty()) {
                tokens.push_back(current);
                current = "";
            }
        }
        
        if (!current.empty()) {
            tokens.push_back(current);
        }
        
        return tokens;
    }
    
public:
    std::vector search(const std::string& query) {
        std::vector queryWords = tokenize(query);
        std::vector results;
        
        if (queryWords.empty()) return results;
        
        // Get documents containing first word
        if (invertedIndex.find(queryWords[0]) != invertedIndex.end()) {
            results = invertedIndex[queryWords[0]];
        }
        
        // Intersect with documents containing other words
        for (size_t i = 1; i < queryWords.size(); i++) {
            if (invertedIndex.find(queryWords[i]) != invertedIndex.end()) {
                std::vector temp;
                std::set_intersection(
                    results.begin(), results.end(),
                    invertedIndex[queryWords[i]].begin(),
                    invertedIndex[queryWords[i]].end(),
                    std::back_inserter(temp)
                );
                results = temp;
            } else {
                results.clear();
                break;
            }
        }
        
        return results;
    }
    
    std::vector autoComplete(const std::string& prefix) {
        return keywordTrie.autoComplete(prefix);
    }
    
    void displayResults(const std::vector& docIds) {
        for (int id : docIds) {
            for (const Document& doc : documents) {
                if (doc.id == id) {
                    std::cout << "Document " << id << ": " << doc.title << std::endl;
                    break;
                }
            }
        }
    }
};

5. Performance Analysis

Algorithm Preprocessing Search Time Space
Naive Search O(1) O(nm) O(1)
KMP O(m) O(n) O(m)
Trie O(total chars) O(m) O(total chars)
Suffix Tree O(n) O(m + k) O(n)

Where n = text length, m = pattern length, k = number of matches

6. Complete Example Program

#include 

int main() {
    std::cout << "=== Unit 3: String Algorithms Demo ===" << std::endl;
    
    // KMP Algorithm Demo
    std::cout << "\n--- KMP Algorithm ---" << std::endl;
    std::string text = "ABABCABABA";
    std::string pattern = "ABABA";
    
    KMPAlgorithm::displayFailureFunction(pattern);
    std::vector matches = KMPAlgorithm::search(text, pattern);
    
    std::cout << "Text: " << text << std::endl;
    std::cout << "Pattern: " << pattern << std::endl;
    std::cout << "Matches found at positions: ";
    for (int pos : matches) {
        std::cout << pos << " ";
    }
    std::cout << std::endl;
    
    // Trie Demo
    std::cout << "\n--- Trie Demo ---" << std::endl;
    Trie trie;
    
    trie.insert("cat");
    trie.insert("car");
    trie.insert("card");
    trie.insert("care");
    trie.insert("careful");
    
    trie.display();
    
    std::cout << "Search 'car': " << (trie.search("car") ? "Found" : "Not found") << std::endl;
    std::cout << "Search 'card': " << (trie.search("card") ? "Found" : "Not found") << std::endl;
    
    std::cout << "Auto-complete for 'car': ";
    std::vector suggestions = trie.autoComplete("car");
    for (const std::string& word : suggestions) {
        std::cout << word << " ";
    }
    std::cout << std::endl;
    
    // Search Engine Demo
    std::cout << "\n--- Search Engine Demo ---" << std::endl;
    SearchEngine engine;
    
    engine.addDocument(Document(1, "Introduction to Algorithms", 
                              "algorithms data structures sorting searching"));
    engine.addDocument(Document(2, "Database Systems", 
                              "database sql queries indexing"));
    engine.addDocument(Document(3, "Advanced Algorithms", 
                              "algorithms graph theory dynamic programming"));
    
    std::vector results = engine.search("algorithms");
    std::cout << "Search results for 'algorithms':" << std::endl;
    engine.displayResults(results);
    
    return 0;
}