Skip to content

Replace linear symbol lookup with hash table for O(1) performance #294

@jserv

Description

@jserv

Description

Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.

Current Implementation

// src/globals.c - Current linear search
symbol_t *find_symbol(char *name) {
    for (symbol_t *sym = symbols; sym; sym = sym->next) {
        if (strcmp(sym->name, name) == 0)
            return sym;
    }
    return NULL;
}

Performance profile shows:

  • 10,000 symbols: ~350ms lookup time (35% of total)
  • 1,000 symbols: ~35ms lookup time (20% of total)
  • 100 symbols: ~3ms lookup time (5% of total)

Proposed Hash Table Implementation

1. Data Structures

// Add to src/defs.h
#define SYMBOL_HASH_SIZE 1021  // Prime number for better distribution

typedef struct symbol_entry {
    symbol_t *symbol;
    struct symbol_entry *next;  // Chain for collision resolution
} symbol_entry_t;

typedef struct {
    symbol_entry_t *buckets[SYMBOL_HASH_SIZE];
    int total_symbols;
    int max_chain_length;  // For statistics
    
    // Statistics
    int lookups;
    int collisions;
    int hits;
    int misses;
} symbol_table_t;

// Global symbol tables
typedef struct {
    symbol_table_t *global_symbols;
    symbol_table_t *local_symbols;  // Current function scope
    symbol_table_t *types;          // Type definitions
} symbol_context_t;

2. Hash Function

// src/symbol_table.c - New file
// DJB2 hash function - simple and effective
uint32_t hash_string(const char *str) {
    uint32_t hash = 5381;
    int c;
    
    while ((c = *str++)) {
        hash = ((hash << 5) + hash) + c;  // hash * 33 + c
    }
    
    return hash;
}

int get_bucket_index(const char *name) {
    return hash_string(name) % SYMBOL_HASH_SIZE;
}

3. Symbol Table Operations

symbol_table_t *create_symbol_table(void) {
    symbol_table_t *table = calloc(1, sizeof(symbol_table_t));
    return table;
}

void insert_symbol(symbol_table_t *table, symbol_t *symbol) {
    int index = get_bucket_index(symbol->name);
    
    // Create new entry
    symbol_entry_t *entry = malloc(sizeof(symbol_entry_t));
    entry->symbol = symbol;
    
    // Insert at head of chain (O(1) insertion)
    entry->next = table->buckets[index];
    table->buckets[index] = entry;
    
    // Update statistics
    table->total_symbols++;
    if (entry->next) {
        table->collisions++;
    }
    
    // Track max chain length
    int chain_len = 0;
    for (symbol_entry_t *e = entry; e; e = e->next) {
        chain_len++;
    }
    if (chain_len > table->max_chain_length) {
        table->max_chain_length = chain_len;
    }
}

symbol_t *lookup_symbol(symbol_table_t *table, const char *name) {
    table->lookups++;
    
    int index = get_bucket_index(name);
    symbol_entry_t *entry = table->buckets[index];
    
    // Search chain
    while (entry) {
        if (strcmp(entry->symbol->name, name) == 0) {
            table->hits++;
            return entry->symbol;
        }
        entry = entry->next;
    }
    
    table->misses++;
    return NULL;
}

// Replace global find_symbol
symbol_t *find_symbol_fast(const char *name) {
    symbol_t *sym = NULL;
    
    // Check local scope first
    if (current_context->local_symbols) {
        sym = lookup_symbol(current_context->local_symbols, name);
        if (sym) return sym;
    }
    
    // Check global scope
    return lookup_symbol(current_context->global_symbols, name);
}

4. Dynamic Resizing (Optional Enhancement)

void resize_symbol_table(symbol_table_t *table) {
    // Only resize if load factor > 0.75
    float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE;
    if (load_factor < 0.75) return;
    
    // Save old buckets
    symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE];
    memcpy(old_buckets, table->buckets, sizeof(old_buckets));
    
    // Clear table
    memset(table->buckets, 0, sizeof(table->buckets));
    table->total_symbols = 0;
    table->collisions = 0;
    
    // Rehash all symbols with new size
    for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
        symbol_entry_t *entry = old_buckets[i];
        while (entry) {
            symbol_entry_t *next = entry->next;
            entry->next = NULL;
            
            // Reinsert with new hash
            int new_index = get_bucket_index(entry->symbol->name);
            entry->next = table->buckets[new_index];
            table->buckets[new_index] = entry;
            table->total_symbols++;
            
            entry = next;
        }
    }
}

5. Integration Points

// Update src/parser.c
void enter_scope(void) {
    // Create new local symbol table
    current_context->local_symbols = create_symbol_table();
}

void exit_scope(void) {
    // Print statistics if verbose
    if (verbose_mode) {
        print_symbol_table_stats(current_context->local_symbols);
    }
    
    // Free local symbol table
    free_symbol_table(current_context->local_symbols);
    current_context->local_symbols = NULL;
}

// Update symbol creation
symbol_t *create_symbol(const char *name, type_t *type) {
    symbol_t *sym = malloc(sizeof(symbol_t));
    sym->name = strdup(name);
    sym->type = type;
    
    // Insert into appropriate table
    if (current_scope == SCOPE_GLOBAL) {
        insert_symbol(current_context->global_symbols, sym);
    } else {
        insert_symbol(current_context->local_symbols, sym);
    }
    
    return sym;
}

6. Performance Monitoring

void print_symbol_table_stats(symbol_table_t *table) {
    printf("=== Symbol Table Statistics ===\n");
    printf("Total symbols:     %d\n", table->total_symbols);
    printf("Hash buckets:      %d\n", SYMBOL_HASH_SIZE);
    printf("Load factor:       %.2f\n", 
           (float)table->total_symbols / SYMBOL_HASH_SIZE);
    printf("Collisions:        %d (%.1f%%)\n", 
           table->collisions,
           table->collisions * 100.0 / table->total_symbols);
    printf("Max chain length:  %d\n", table->max_chain_length);
    printf("Total lookups:     %d\n", table->lookups);
    printf("Hit rate:          %.1f%%\n", 
           table->hits * 100.0 / table->lookups);
    
    // Distribution analysis
    int empty = 0, chains = 0;
    for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
        if (!table->buckets[i]) {
            empty++;
        } else if (table->buckets[i]->next) {
            chains++;
        }
    }
    printf("Empty buckets:     %d (%.1f%%)\n", 
           empty, empty * 100.0 / SYMBOL_HASH_SIZE);
    printf("Chains:            %d\n", chains);
}

Testing

Performance Benchmark

// tests/perf/symbol_lookup.c
#define NUM_SYMBOLS 10000

void benchmark_symbol_lookup() {
    clock_t start, end;
    
    // Create many symbols
    for (int i = 0; i < NUM_SYMBOLS; i++) {
        char name[32];
        sprintf(name, "symbol_%d", i);
        create_symbol(name, &type_int);
    }
    
    // Benchmark lookups
    start = clock();
    for (int i = 0; i < NUM_SYMBOLS * 10; i++) {
        char name[32];
        sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS);
        find_symbol_fast(name);
    }
    end = clock();
    
    double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("100,000 lookups in %.3f seconds\n", time_spent);
    printf("Average: %.3f microseconds per lookup\n", 
           time_spent * 1000000 / (NUM_SYMBOLS * 10));
}

Correctness Tests

// Verify hash table behaves identically to linear search
void test_symbol_table_correctness() {
    // Test insertion and lookup
    symbol_t *s1 = create_symbol("test1", &type_int);
    symbol_t *s2 = create_symbol("test2", &type_char);
    
    assert(find_symbol_fast("test1") == s1);
    assert(find_symbol_fast("test2") == s2);
    assert(find_symbol_fast("nonexistent") == NULL);
    
    // Test collision handling
    // Force collisions by creating symbols with same hash
    for (int i = 0; i < 100; i++) {
        char name[32];
        sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
        create_symbol(name, &type_int);
    }
    
    // Verify all can be found
    for (int i = 0; i < 100; i++) {
        char name[32];
        sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
        assert(find_symbol_fast(name) != NULL);
    }
}

Migration Plan

  1. Phase 1: Implement hash table alongside existing code
  2. Phase 2: Add compatibility layer with fallback
  3. Phase 3: Migrate all symbol lookups
  4. Phase 4: Remove old linear search code

Success Criteria

  • O(1) average lookup time
  • No functionality regression
  • 30%+ speedup for large programs
  • Hash collision rate < 10%
  • All existing tests pass

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions