LIVE / thrust /examples /cuda /custom_temporary_allocation.cu
Xu Ma
update
1c3c0d9
raw
history blame
5.1 kB
#include <thrust/system/cuda/vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/pair.h>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <map>
#include <cassert>
// This example demonstrates how to control how Thrust allocates temporary
// storage during algorithms such as thrust::sort. The idea will be to create a
// simple cache of allocations to search when temporary storage is requested.
// If a hit is found in the cache, we quickly return the cached allocation
// instead of resorting to the more expensive thrust::cuda::malloc.
// Note: Thrust now has its own caching allocator layer; if you just need a
// caching allocator, you ought to use that. This example is still useful
// as a demonstration of how to use a Thrust custom allocator.
// Note: this implementation cached_allocator is not thread-safe. If multiple
// (host) threads use the same cached_allocator then they should gain exclusive
// access to the allocator before accessing its methods.
struct not_my_pointer
{
not_my_pointer(void* p)
: message()
{
std::stringstream s;
s << "Pointer `" << p << "` was not allocated by this allocator.";
message = s.str();
}
virtual ~not_my_pointer() {}
virtual const char* what() const
{
return message.c_str();
}
private:
std::string message;
};
// A simple allocator for caching cudaMalloc allocations.
struct cached_allocator
{
typedef char value_type;
cached_allocator() {}
~cached_allocator()
{
free_all();
}
char *allocate(std::ptrdiff_t num_bytes)
{
std::cout << "cached_allocator::allocate(): num_bytes == "
<< num_bytes
<< std::endl;
char *result = 0;
// Search the cache for a free block.
free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
if (free_block != free_blocks.end())
{
std::cout << "cached_allocator::allocate(): found a free block"
<< std::endl;
result = free_block->second;
// Erase from the `free_blocks` map.
free_blocks.erase(free_block);
}
else
{
// No allocation of the right size exists, so create a new one with
// `thrust::cuda::malloc`.
try
{
std::cout << "cached_allocator::allocate(): allocating new block"
<< std::endl;
// Allocate memory and convert the resulting `thrust::cuda::pointer` to
// a raw pointer.
result = thrust::cuda::malloc<char>(num_bytes).get();
}
catch (std::runtime_error&)
{
throw;
}
}
// Insert the allocated pointer into the `allocated_blocks` map.
allocated_blocks.insert(std::make_pair(result, num_bytes));
return result;
}
void deallocate(char *ptr, size_t)
{
std::cout << "cached_allocator::deallocate(): ptr == "
<< reinterpret_cast<void*>(ptr) << std::endl;
// Erase the allocated block from the allocated blocks map.
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
throw not_my_pointer(reinterpret_cast<void*>(ptr));
std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
// Insert the block into the free blocks map.
free_blocks.insert(std::make_pair(num_bytes, ptr));
}
private:
typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
typedef std::map<char*, std::ptrdiff_t> allocated_blocks_type;
free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;
void free_all()
{
std::cout << "cached_allocator::free_all()" << std::endl;
// Deallocate all outstanding blocks in both lists.
for ( free_blocks_type::iterator i = free_blocks.begin()
; i != free_blocks.end()
; ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
}
for( allocated_blocks_type::iterator i = allocated_blocks.begin()
; i != allocated_blocks.end()
; ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
}
}
};
int main()
{
std::size_t num_elements = 32768;
thrust::host_vector<int> h_input(num_elements);
// Generate random input.
thrust::generate(h_input.begin(), h_input.end(), rand);
thrust::cuda::vector<int> d_input = h_input;
thrust::cuda::vector<int> d_result(num_elements);
std::size_t num_trials = 5;
cached_allocator alloc;
for (std::size_t i = 0; i < num_trials; ++i)
{
d_result = d_input;
// Pass alloc through cuda::par as the first parameter to sort
// to cause allocations to be handled by alloc during sort.
thrust::sort(thrust::cuda::par(alloc), d_result.begin(), d_result.end());
// Ensure the result is sorted.
assert(thrust::is_sorted(d_result.begin(), d_result.end()));
}
return 0;
}