Skip to content

Memory Management Best Practices

Learn advanced techniques for efficient memory management in HPC applications, focusing on NUMA awareness, cache optimization, and memory bandwidth utilization.

Memory Architecture Overview

NUMA Considerations

// NUMA-aware allocation
#include <numa.h>

// Allocate `size` bytes of memory bound to the given NUMA node.
// Throws std::runtime_error if libnuma cannot satisfy the request.
// NOTE: the buffer comes from numa_alloc_onnode(), so it must be
// released with numa_free(ptr, size) — never plain free().
void* allocate_numa_memory(size_t size, int node) {
    void* mem = numa_alloc_onnode(size, node);
    if (mem == nullptr) {
        throw std::runtime_error("NUMA allocation failed");
    }
    return mem;
}

Memory Hierarchy

graph TD
    L1[L1 Cache] --> L2[L2 Cache]
    L2 --> L3[L3 Cache]
    L3 --> RAM[Main Memory]
    RAM --> NVME[NVMe Storage]

Best Practices

1. Memory Alignment

// Portable aligned allocation built on posix_memalign.
// `alignment` must be a power of two and a multiple of sizeof(void*);
// posix_memalign rejects anything else, in which case nullptr is
// returned (same as on out-of-memory).
// WARNING: this name collides with the standard C11/C++17
// ::aligned_alloc declared in <stdlib.h>/<cstdlib> — glibc declares the
// standard one noexcept, so this definition will not compile alongside
// that header. Prefer a distinct name (e.g. allocate_aligned) in real
// code; the name is kept here only so the examples below still match.
void* aligned_alloc(size_t alignment, size_t size) {
    void* buffer = nullptr;
    const int status = posix_memalign(&buffer, alignment, size);
    return status == 0 ? buffer : nullptr;
}

// Usage example
float* data = (float*)aligned_alloc(64, n * sizeof(float));

2. Cache-Friendly Access Patterns

// Bad: Cache thrashing
// The inner loop strides down a column: consecutive iterations touch
// matrix[i][j] then matrix[i+1][j]. In C/C++ row-major layout those are
// a whole row apart in memory, so each access lands on a different
// cache line and most of each fetched line goes unused.
for(int j = 0; j < n; j++)
    for(int i = 0; i < n; i++)
        matrix[i][j] = compute(i, j);

// Good: Cache-friendly
// The inner loop walks along a row: matrix[i][j] and matrix[i][j+1]
// are adjacent in memory, so every fetched cache line is fully
// consumed before it is evicted.
for(int i = 0; i < n; i++)
    for(int j = 0; j < n; j++)
        matrix[i][j] = compute(i, j);

3. Memory Pooling

// Fixed-size slot pool: hands out cache-line-aligned, uninitialized
// storage for one T per allocate() call. Grows by whole blocks of
// BlockSize slots; freed slots are recycled via deallocate().
// Fixes over the naive version: allocation failure now throws
// std::bad_alloc instead of silently filling the free list with
// offsets from a null pointer; the pool owns its blocks (destructor
// releases them) and is non-copyable so ownership cannot be duplicated.
template<typename T, size_t BlockSize>
class MemoryPool {
    static_assert(BlockSize > 0, "pool block must hold at least one slot");

private:
    std::vector<T*> blocks;     // owned raw storage, one entry per grown block
    std::vector<T*> free_list;  // slots currently available for reuse

public:
    MemoryPool() = default;

    // Owns raw memory: copying would double-free. Moves are omitted for
    // brevity; add them if pools need to change hands.
    MemoryPool(const MemoryPool&) = delete;
    MemoryPool& operator=(const MemoryPool&) = delete;

    ~MemoryPool() {
        for (T* block : blocks) {
            ::operator delete(block, std::align_val_t{64});
        }
    }

    // Returns a pointer to uninitialized storage for one T (caller
    // placement-news / destroys as needed). Throws std::bad_alloc on
    // out-of-memory.
    T* allocate() {
        if (free_list.empty()) {
            // Reserve bookkeeping space first so the push_back below
            // cannot throw after the raw block has been allocated.
            blocks.reserve(blocks.size() + 1);
            free_list.reserve(free_list.size() + BlockSize);

            // C++17 aligned operator new: throws std::bad_alloc on
            // failure, unlike the unchecked aligned_alloc it replaces.
            T* block = static_cast<T*>(
                ::operator new(BlockSize * sizeof(T), std::align_val_t{64}));
            blocks.push_back(block);

            for (size_t i = 0; i < BlockSize; i++) {
                free_list.push_back(block + i);
            }
        }

        T* ptr = free_list.back();
        free_list.pop_back();
        return ptr;
    }

    // Returns a slot previously obtained from allocate() to the pool.
    // The slot must belong to this pool; no check is performed.
    void deallocate(T* ptr) {
        free_list.push_back(ptr);
    }
};

Performance Optimization

1. NUMA Binding

# Bind processes to NUMA nodes
numactl --membind=0 ./your_app    # Memory on node 0
numactl --cpunodebind=0 ./your_app  # CPUs on node 0

2. Memory Bandwidth

// STREAM "triad" kernel: a[i] = b[i] + scalar * c[i] for i in [0, n).
// The reference STREAM benchmark declares the arrays `restrict`;
// without it the compiler must assume a/b/c may alias and cannot
// vectorize freely, which defeats the point of a bandwidth benchmark.
// Caller must guarantee the three arrays do not overlap.
// b and c are taken as pointer-to-const (read-only inputs); callers
// passing plain double* are unaffected.
void stream_triad(double* __restrict a, const double* __restrict b,
                  const double* __restrict c,
                  const double scalar, size_t n) {
    #pragma omp parallel for
    for(size_t i = 0; i < n; i++) {
        a[i] = b[i] + scalar * c[i];
    }
}

3. Memory Prefetching

// Software prefetching
#include <xmmintrin.h>

// Walk `arr` one 64-byte cache line (16 floats) at a time, issuing a
// software prefetch for the NEXT line before processing the current
// one, so memory latency overlaps with computation.
void prefetch_array(float* arr, int n) {
    constexpr int kLineFloats = 16;  // 64-byte line / sizeof(float)
    for (int i = 0; i < n; i += kLineFloats) {
        // Hint the next line into all cache levels. On the final
        // iterations this targets past-the-end addresses; prefetch
        // instructions are hints and do not fault on invalid addresses.
        _mm_prefetch(reinterpret_cast<const char*>(arr + i + kLineFloats),
                     _MM_HINT_T0);
        process_elements(arr + i);
    }
}

Common Pitfalls

Memory Fragmentation

Problem: frequent small allocations and deallocations fragment the heap and add allocator overhead over time.

Solution: use memory pools or custom allocators so fixed-size blocks are reused instead of repeatedly returned to the system allocator.

// Custom allocator facade over MemoryPool. The pool hands out
// fixed-size single-object slots, so only n == 1 is supported.
template<typename T>
class PoolAllocator {
    MemoryPool<T, 1024> pool;
public:
    // Returns storage for exactly one T. Throws std::bad_alloc for
    // n != 1: the previous version ignored `n` and returned a single
    // slot even for array requests, handing callers undersized storage
    // (a guaranteed buffer overrun).
    T* allocate(size_t n) {
        if (n != 1) {
            throw std::bad_alloc();
        }
        return pool.allocate();
    }
};

False Sharing

Problem: when data written by different threads shares a cache line, the line ping-pongs between cores (false sharing), causing contention.

Solution: pad and align each thread's data to its own cache line.

// Per-thread slot padded to a full cache line so adjacent entries in
// an array never share a line (prevents false sharing).
struct alignas(64) ThreadData {
    double value;
    // alignas(64) is what guarantees the 64-byte alignment AND already
    // rounds sizeof up to 64; the explicit padding merely documents
    // the layout (the old comment wrongly credited it with alignment).
    char padding[56];
};
// Lock the layout in at compile time.
static_assert(sizeof(ThreadData) == 64,
              "ThreadData must occupy exactly one cache line");

Monitoring and Profiling

Memory Usage Analysis

# Monitor memory bandwidth
likwid-perfctr -C 0-11 -g MEM ./your_app

# Memory access pattern analysis
valgrind --tool=cachegrind ./your_app

Performance Metrics

| Metric            | Target    | Impact   |
|-------------------|-----------|----------|
| Cache Miss Rate   | <5%       | Critical |
| Memory Bandwidth  | >80% Peak | High     |
| NUMA Local Access | >90%      | High     |
| Page Faults       | Minimal   | Medium   |

System-Specific Optimizations

# AMD EPYC optimization
module load rocm
export HSA_ENABLE_SDMA=0
# Intel optimization
module load intel
export KMP_AFFINITY=compact
# ARM optimization
module load arm-forge
export ARMPL_MEMORY_ADVISE=1

References

  1. NUMA Programming Guide
  2. Intel Memory Management Guide
  3. AMD EPYC Memory Guide