Unit 3: String Algorithms - Advanced Algorithms

Learning Objectives

Master string matching algorithms, particularly KMP
Understand and implement Trie data structures
Learn about compressed and suffix tries
Apply string algorithms to search engine design
Analyze time and space complexity of string algorithms

1. KMP (Knuth-Morris-Pratt) Algorithm

The KMP algorithm efficiently finds occurrences of a pattern within a text by utilizing information about the pattern itself to skip characters during the search.

Key Concepts:

Failure Function: Preprocessing to avoid redundant comparisons
Border: Proper prefix that is also a suffix
Time Complexity: O(n + m) where n is text length, m is pattern length

KMP Algorithm Visualization

Watch how KMP efficiently searches for patterns:

Text:

Pattern:

Click "Search" to start the KMP algorithm visualization

KMP Implementation

#include 
#include 
#include 

class KMPAlgorithm {
public:
    // Compute failure function (LPS array)
    static std::vector computeFailureFunction(const std::string& pattern) {
        int m = pattern.length();
        std::vector failure(m, 0);
        int j = 0;
        
        for (int i = 1; i < m; i++) {
            while (j > 0 && pattern[i] != pattern[j]) {
                j = failure[j - 1];
            }
            
            if (pattern[i] == pattern[j]) {
                j++;
            }
            
            failure[i] = j;
        }
        
        return failure;
    }
    
    // KMP search algorithm
    static std::vector search(const std::string& text, const std::string& pattern) {
        int n = text.length();
        int m = pattern.length();
        std::vector matches;
        
        if (m == 0) return matches;
        
        // Compute failure function
        std::vector failure = computeFailureFunction(pattern);
        
        int j = 0; // pattern index
        
        for (int i = 0; i < n; i++) {
            while (j > 0 && text[i] != pattern[j]) {
                j = failure[j - 1];
            }
            
            if (text[i] == pattern[j]) {
                j++;
            }
            
            if (j == m) {
                matches.push_back(i - m + 1);
                j = failure[j - 1];
            }
        }
        
        return matches;
    }
    
    // Display failure function
    static void displayFailureFunction(const std::string& pattern) {
        std::vector failure = computeFailureFunction(pattern);
        
        std::cout << "Pattern: ";
        for (char c : pattern) {
            std::cout << c << " ";
        }
        std::cout << std::endl;
        
        std::cout << "Failure: ";
        for (int f : failure) {
            std::cout << f << " ";
        }
        std::cout << std::endl;
    }
};

2. Trie Data Structure

A Trie (prefix tree) is a tree-like data structure used to store and search strings efficiently. Each node represents a character, and paths from root to leaves represent complete strings.

Trie Construction and Search

Build a trie and search for words:

Insert Word:

Search:

Insert words to build the trie structure

Trie Implementation

#include 
#include 
#include 
#include 

class TrieNode {
public:
    std::unordered_map children;
    bool isEndOfWord;
    
    TrieNode() : isEndOfWord(false) {}
    
    ~TrieNode() {
        for (auto& pair : children) {
            delete pair.second;
        }
    }
};

class Trie {
private:
    TrieNode* root;
    
public:
    Trie() {
        root = new TrieNode();
    }
    
    ~Trie() {
        delete root;
    }
    
    void insert(const std::string& word) {
        TrieNode* current = root;
        
        for (char c : word) {
            if (current->children.find(c) == current->children.end()) {
                current->children[c] = new TrieNode();
            }
            current = current->children[c];
        }
        
        current->isEndOfWord = true;
    }
    
    bool search(const std::string& word) {
        TrieNode* current = root;
        
        for (char c : word) {
            if (current->children.find(c) == current->children.end()) {
                return false;
            }
            current = current->children[c];
        }
        
        return current->isEndOfWord;
    }
    
    bool startsWith(const std::string& prefix) {
        TrieNode* current = root;
        
        for (char c : prefix) {
            if (current->children.find(c) == current->children.end()) {
                return false;
            }
            current = current->children[c];
        }
        
        return true;
    }
    
    std::vector autoComplete(const std::string& prefix) {
        std::vector results;
        TrieNode* current = root;
        
        // Navigate to prefix node
        for (char c : prefix) {
            if (current->children.find(c) == current->children.end()) {
                return results; // Prefix not found
            }
            current = current->children[c];
        }
        
        // Collect all words with this prefix
        collectWords(current, prefix, results);
        return results;
    }
    
private:
    void collectWords(TrieNode* node, const std::string& currentWord, 
                     std::vector& results) {
        if (node->isEndOfWord) {
            results.push_back(currentWord);
        }
        
        for (auto& pair : node->children) {
            collectWords(pair.second, currentWord + pair.first, results);
        }
    }
    
public:
    void display() {
        std::cout << "Trie contents:" << std::endl;
        displayHelper(root, "");
    }
    
private:
    void displayHelper(TrieNode* node, const std::string& prefix) {
        if (node->isEndOfWord) {
            std::cout << prefix << std::endl;
        }
        
        for (auto& pair : node->children) {
            displayHelper(pair.second, prefix + pair.first);
        }
    }
};

3. Compressed Tries (Radix Trees)

Compressed tries optimize space by merging nodes with single children, storing edge labels instead of individual characters.

Advantages of Compressed Tries:

Space Efficient: Reduces number of nodes
Faster Traversal: Fewer nodes to visit
Edge Labels: Store substrings instead of single characters

Compressed Trie Implementation

#include 
#include 
#include 

class CompressedTrieNode {
public:
    std::unordered_map children;
    std::string edgeLabel;
    bool isEndOfWord;
    
    CompressedTrieNode(const std::string& label = "") 
        : edgeLabel(label), isEndOfWord(false) {}
    
    ~CompressedTrieNode() {
        for (auto& pair : children) {
            delete pair.second;
        }
    }
};

class CompressedTrie {
private:
    CompressedTrieNode* root;
    
public:
    CompressedTrie() {
        root = new CompressedTrieNode();
    }
    
    ~CompressedTrie() {
        delete root;
    }
    
    void insert(const std::string& word) {
        insertHelper(root, word, 0);
    }
    
private:
    void insertHelper(CompressedTrieNode* node, const std::string& word, int index) {
        if (index == word.length()) {
            node->isEndOfWord = true;
            return;
        }
        
        char firstChar = word[index];
        
        if (node->children.find(firstChar) == node->children.end()) {
            // Create new node with remaining suffix
            node->children[firstChar] = 
                new CompressedTrieNode(word.substr(index));
            node->children[firstChar]->isEndOfWord = true;
            return;
        }
        
        CompressedTrieNode* child = node->children[firstChar];
        std::string& edgeLabel = child->edgeLabel;
        
        // Find common prefix
        int commonLen = 0;
        int minLen = std::min(edgeLabel.length(), word.length() - index);
        
        while (commonLen < minLen && 
               edgeLabel[commonLen] == word[index + commonLen]) {
            commonLen++;
        }
        
        if (commonLen == edgeLabel.length()) {
            // Edge label is prefix of remaining word
            insertHelper(child, word, index + commonLen);
        } else {
            // Split the edge
            CompressedTrieNode* newNode = 
                new CompressedTrieNode(edgeLabel.substr(commonLen));
            newNode->children = std::move(child->children);
            newNode->isEndOfWord = child->isEndOfWord;
            
            child->children.clear();
            child->children[edgeLabel[commonLen]] = newNode;
            child->edgeLabel = edgeLabel.substr(0, commonLen);
            child->isEndOfWord = (index + commonLen == word.length());
            
            if (index + commonLen < word.length()) {
                char nextChar = word[index + commonLen];
                child->children[nextChar] = 
                    new CompressedTrieNode(word.substr(index + commonLen));
                child->children[nextChar]->isEndOfWord = true;
            }
        }
    }
    
public:
    bool search(const std::string& word) {
        return searchHelper(root, word, 0);
    }
    
private:
    bool searchHelper(CompressedTrieNode* node, const std::string& word, int index) {
        if (index == word.length()) {
            return node->isEndOfWord;
        }
        
        char firstChar = word[index];
        
        if (node->children.find(firstChar) == node->children.end()) {
            return false;
        }
        
        CompressedTrieNode* child = node->children[firstChar];
        std::string& edgeLabel = child->edgeLabel;
        
        if (word.substr(index, edgeLabel.length()) != edgeLabel) {
            return false;
        }
        
        return searchHelper(child, word, index + edgeLabel.length());
    }
};

4. Suffix Trees and Search Engines

Suffix trees are compressed tries of all suffixes of a string, enabling efficient substring searches and pattern matching.

Search Engine Applications

1 Indexing: Build tries/suffix trees for document collections

2 Query Processing: Use string matching for keyword search

3 Auto-completion: Implement prefix-based suggestions

4 Ranking: Score documents based on pattern occurrences

Simple Search Engine Implementation

#include 
#include 
#include 
#include 
#include 

class Document {
public:
    int id;
    std::string title;
    std::string content;
    
    Document(int id, const std::string& title, const std::string& content)
        : id(id), title(title), content(content) {}
};

class SearchEngine {
private:
    std::vector documents;
    Trie keywordTrie;
    std::unordered_map> invertedIndex;
    
public:
    void addDocument(const Document& doc) {
        documents.push_back(doc);
        indexDocument(doc);
    }
    
private:
    void indexDocument(const Document& doc) {
        // Tokenize and index words
        std::vector words = tokenize(doc.content);
        
        for (const std::string& word : words) {
            keywordTrie.insert(word);
            invertedIndex[word].push_back(doc.id);
        }
    }
    
    std::vector tokenize(const std::string& text) {
        std::vector tokens;
        std::string current = "";
        
        for (char c : text) {
            if (std::isalnum(c)) {
                current += std::tolower(c);
            } else if (!current.empty()) {
                tokens.push_back(current);
                current = "";
            }
        }
        
        if (!current.empty()) {
            tokens.push_back(current);
        }
        
        return tokens;
    }
    
public:
    std::vector search(const std::string& query) {
        std::vector queryWords = tokenize(query);
        std::vector results;
        
        if (queryWords.empty()) return results;
        
        // Get documents containing first word
        if (invertedIndex.find(queryWords[0]) != invertedIndex.end()) {
            results = invertedIndex[queryWords[0]];
        }
        
        // Intersect with documents containing other words
        for (size_t i = 1; i < queryWords.size(); i++) {
            if (invertedIndex.find(queryWords[i]) != invertedIndex.end()) {
                std::vector temp;
                std::set_intersection(
                    results.begin(), results.end(),
                    invertedIndex[queryWords[i]].begin(),
                    invertedIndex[queryWords[i]].end(),
                    std::back_inserter(temp)
                );
                results = temp;
            } else {
                results.clear();
                break;
            }
        }
        
        return results;
    }
    
    std::vector autoComplete(const std::string& prefix) {
        return keywordTrie.autoComplete(prefix);
    }
    
    void displayResults(const std::vector& docIds) {
        for (int id : docIds) {
            for (const Document& doc : documents) {
                if (doc.id == id) {
                    std::cout << "Document " << id << ": " << doc.title << std::endl;
                    break;
                }
            }
        }
    }
};

5. Performance Analysis

Algorithm	Preprocessing	Search Time	Space
Naive Search	O(1)	O(nm)	O(1)
KMP	O(m)	O(n)	O(m)
Trie	O(total chars)	O(m)	O(total chars)
Suffix Tree	O(n)	O(m + k)	O(n)

Where n = text length, m = pattern length, k = number of matches

6. Complete Example Program

#include 

int main() {
    std::cout << "=== Unit 3: String Algorithms Demo ===" << std::endl;
    
    // KMP Algorithm Demo
    std::cout << "\n--- KMP Algorithm ---" << std::endl;
    std::string text = "ABABCABABA";
    std::string pattern = "ABABA";
    
    KMPAlgorithm::displayFailureFunction(pattern);
    std::vector matches = KMPAlgorithm::search(text, pattern);
    
    std::cout << "Text: " << text << std::endl;
    std::cout << "Pattern: " << pattern << std::endl;
    std::cout << "Matches found at positions: ";
    for (int pos : matches) {
        std::cout << pos << " ";
    }
    std::cout << std::endl;
    
    // Trie Demo
    std::cout << "\n--- Trie Demo ---" << std::endl;
    Trie trie;
    
    trie.insert("cat");
    trie.insert("car");
    trie.insert("card");
    trie.insert("care");
    trie.insert("careful");
    
    trie.display();
    
    std::cout << "Search 'car': " << (trie.search("car") ? "Found" : "Not found") << std::endl;
    std::cout << "Search 'card': " << (trie.search("card") ? "Found" : "Not found") << std::endl;
    
    std::cout << "Auto-complete for 'car': ";
    std::vector suggestions = trie.autoComplete("car");
    for (const std::string& word : suggestions) {
        std::cout << word << " ";
    }
    std::cout << std::endl;
    
    // Search Engine Demo
    std::cout << "\n--- Search Engine Demo ---" << std::endl;
    SearchEngine engine;
    
    engine.addDocument(Document(1, "Introduction to Algorithms", 
                              "algorithms data structures sorting searching"));
    engine.addDocument(Document(2, "Database Systems", 
                              "database sql queries indexing"));
    engine.addDocument(Document(3, "Advanced Algorithms", 
                              "algorithms graph theory dynamic programming"));
    
    std::vector results = engine.search("algorithms");
    std::cout << "Search results for 'algorithms':" << std::endl;
    engine.displayResults(results);
    
    return 0;
}