Skip to content

Memory Management Best Practices

Learn advanced techniques for efficient memory management in HPC applications, focusing on NUMA awareness, cache optimization, and memory bandwidth utilization.

Memory Architecture Overview

NUMA Considerations

// NUMA-aware allocation
#include <numa.h>

// Allocate `size` bytes of memory bound to the given NUMA node.
// Throws std::runtime_error if libnuma cannot satisfy the request.
// NOTE: the buffer comes from numa_alloc_onnode(), so it must be
// released with numa_free(ptr, size) — never plain free().
void* allocate_numa_memory(size_t size, int node) {
    void* mem = numa_alloc_onnode(size, node);
    if (mem == nullptr) {
        throw std::runtime_error("NUMA allocation failed");
    }
    return mem;
}

Memory Hierarchy

graph TD
    L1[L1 Cache] --> L2[L2 Cache]
    L2 --> L3[L3 Cache]
    L3 --> RAM[Main Memory]
    RAM --> NVME[NVMe Storage]

Best Practices

1. Memory Alignment

// Portable aligned allocation built on posix_memalign.
// `alignment` must be a power of two and a multiple of sizeof(void*);
// posix_memalign rejects anything else, in which case nullptr is
// returned (same as on out-of-memory).
// WARNING: this name collides with the standard C11/C++17
// ::aligned_alloc declared in <stdlib.h>/<cstdlib> — glibc declares the
// standard one noexcept, so this definition will not compile alongside
// that header. Prefer a distinct name (e.g. allocate_aligned) in real
// code; the name is kept here only so the examples below still match.
void* aligned_alloc(size_t alignment, size_t size) {
    void* buffer = nullptr;
    const int status = posix_memalign(&buffer, alignment, size);
    return status == 0 ? buffer : nullptr;
}

// Usage example
float* data = (float*)aligned_alloc(64, n * sizeof(float));

2. Cache-Friendly Access Patterns

// Bad: Cache thrashing
// The inner loop strides down a column: consecutive iterations touch
// matrix[i][j] then matrix[i+1][j]. In C/C++ row-major layout those are
// a whole row apart in memory, so each access lands on a different
// cache line and most of each fetched line goes unused.
for(int j = 0; j < n; j++)
    for(int i = 0; i < n; i++)
        matrix[i][j] = compute(i, j);

// Good: Cache-friendly
// The inner loop walks along a row: matrix[i][j] and matrix[i][j+1]
// are adjacent in memory, so every fetched cache line is fully
// consumed before it is evicted.
for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
        matrix[i][j] = compute(i, j);

3. Memory Pooling

// Fixed-size slot pool: hands out cache-line-aligned, uninitialized
// storage for one T per allocate() call. Grows by whole blocks of
// BlockSize slots; freed slots are recycled via deallocate().
// Fixes over the naive version: allocation failure now throws
// std::bad_alloc instead of silently filling the free list with
// offsets from a null pointer; the pool owns its blocks (destructor
// releases them) and is non-copyable so ownership cannot be duplicated.
template<typename T, size_t BlockSize>
class MemoryPool {
    static_assert(BlockSize > 0, "pool block must hold at least one slot");

private:
    std::vector<T*> blocks;     // owned raw storage, one entry per grown block
    std::vector<T*> free_list;  // slots currently available for reuse

public:
    MemoryPool() = default;

    // Owns raw memory: copying would double-free. Moves are omitted for
    // brevity; add them if pools need to change hands.
    MemoryPool(const MemoryPool&) = delete;
    MemoryPool& operator=(const MemoryPool&) = delete;

    ~MemoryPool() {
        for (T* block : blocks) {
            ::operator delete(block, std::align_val_t{64});
        }
    }

    // Returns a pointer to uninitialized storage for one T (caller
    // placement-news / destroys as needed). Throws std::bad_alloc on
    // out-of-memory.
    T* allocate() {
        if (free_list.empty()) {
            // Reserve bookkeeping space first so the push_back below
            // cannot throw after the raw block has been allocated.
            blocks.reserve(blocks.size() + 1);
            free_list.reserve(free_list.size() + BlockSize);

            // C++17 aligned operator new: throws std::bad_alloc on
            // failure, unlike the unchecked aligned_alloc it replaces.
            T* block = static_cast<T*>(
                ::operator new(BlockSize * sizeof(T), std::align_val_t{64}));
            blocks.push_back(block);

            for (size_t i = 0; i < BlockSize; i++) {
                free_list.push_back(block + i);
            }
        }

        T* ptr = free_list.back();
        free_list.pop_back();
        return ptr;
    }

    // Returns a slot previously obtained from allocate() to the pool.
    // The slot must belong to this pool; no check is performed.
    void deallocate(T* ptr) {
        free_list.push_back(ptr);
    }
};

Performance Optimization

1. NUMA Binding

# Bind processes to NUMA nodes
numactl --membind=0 ./your_app    # Memory on node 0
numactl --cpunodebind=0 ./your_app  # CPUs on node 0

2. Memory Bandwidth

// STREAM "triad" kernel: a[i] = b[i] + scalar * c[i] for i in [0, n).
// The reference STREAM benchmark declares the arrays `restrict`;
// without it the compiler must assume a/b/c may alias and cannot
// vectorize freely, which defeats the point of a bandwidth benchmark.
// Caller must guarantee the three arrays do not overlap.
// b and c are taken as pointer-to-const (read-only inputs); callers
// passing plain double* are unaffected.
void stream_triad(double* __restrict a, const double* __restrict b,
                  const double* __restrict c,
                  const double scalar, size_t n) {
    #pragma omp parallel for
    for(size_t i = 0; i < n; i++) {
        a[i] = b[i] + scalar * c[i];
    }
}

3. Memory Prefetching

// Software prefetching
#include <xmmintrin.h>

// Walk `arr` one 64-byte cache line (16 floats) at a time, issuing a
// software prefetch for the NEXT line before processing the current
// one, so memory latency overlaps with computation.
void prefetch_array(float* arr, int n) {
    constexpr int kLineFloats = 16;  // 64-byte line / sizeof(float)
    for (int i = 0; i < n; i += kLineFloats) {
        // Hint the next line into all cache levels. On the final
        // iterations this targets past-the-end addresses; prefetch
        // instructions are hints and do not fault on invalid addresses.
        _mm_prefetch(reinterpret_cast<const char*>(arr + i + kLineFloats),
                     _MM_HINT_T0);
        process_elements(arr + i);
    }
}

Common Pitfalls

Memory Fragmentation

Problem: frequent small allocations and deallocations fragment the heap and add allocator overhead over time.

Solution: use memory pools or custom allocators so fixed-size blocks are reused instead of repeatedly returned to the system allocator.

// Custom allocator facade over MemoryPool. The pool hands out
// fixed-size single-object slots, so only n == 1 is supported.
template<typename T>
class PoolAllocator {
    MemoryPool<T, 1024> pool;
public:
    // Returns storage for exactly one T. Throws std::bad_alloc for
    // n != 1: the previous version ignored `n` and returned a single
    // slot even for array requests, handing callers undersized storage
    // (a guaranteed buffer overrun).
    T* allocate(size_t n) {
        if (n != 1) {
            throw std::bad_alloc();
        }
        return pool.allocate();
    }
};

False Sharing

Problem: when data written by different threads shares a cache line, the line ping-pongs between cores (false sharing), causing contention.

Solution: pad and align each thread's data to its own cache line.

// Per-thread slot padded to a full cache line so adjacent entries in
// an array never share a line (prevents false sharing).
struct alignas(64) ThreadData {
    double value;
    // alignas(64) is what guarantees the 64-byte alignment AND already
    // rounds sizeof up to 64; the explicit padding merely documents
    // the layout (the old comment wrongly credited it with alignment).
    char padding[56];
};
// Lock the layout in at compile time.
static_assert(sizeof(ThreadData) == 64,
              "ThreadData must occupy exactly one cache line");

Monitoring and Profiling

Memory Usage Analysis

# Monitor memory bandwidth
likwid-perfctr -C 0-11 -g MEM ./your_app

# Memory access pattern analysis
valgrind --tool=cachegrind ./your_app

Performance Metrics

| Metric            | Target    | Impact   |
|-------------------|-----------|----------|
| Cache Miss Rate   | <5%       | Critical |
| Memory Bandwidth  | >80% Peak | High     |
| NUMA Local Access | >90%      | High     |
| Page Faults       | Minimal   | Medium   |

System-Specific Optimizations

# AMD EPYC optimization
module load rocm
export HSA_ENABLE_SDMA=0
# Intel optimization
module load intel
export KMP_AFFINITY=compact
# ARM optimization
module load arm-forge
export ARMPL_MEMORY_ADVISE=1

References

  1. NUMA Programming Guide
  2. Intel Memory Management Guide
  3. AMD EPYC Memory Guide