Parallel Counting
Listing 3 shows an implementation where the counting Step #1 of the algorithm is parallel using the parallel_reduce construct, similar to the method used in the Parallel Counting Sort of [3 and 4]. Table 3 and Figure 3 show performance measurements of implementations in Listing 2 and 3 along with other algorithms.
// Copyright(c), Victor J. Duvanenko, 2010
// Listing 3
template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class _Type >
class RadixInPlaceOperation_3
{
_Type* my_a; // a local copy to the input array to provide a pointer to each parallel task
long* my_startOfBin;
long* my_endOfBin;
_Type my_bitMask; // a local copy of the bitMask
unsigned long my_shiftRightAmount;
static const unsigned long Threshold_P = 10000; // threshold when to switch between parallel and non-parallel implementations
public:
static const unsigned long numberOfBins = PowerOfTwoRadix;
unsigned long my_count[ numberOfBins ]; // the count for this task
RadixInPlaceOperation_3( _Type a[], long* startOfBin, long* endOfBin, _Type bitMask, unsigned long shiftRightAmount ) :
my_a( a ), my_startOfBin( startOfBin ), my_endOfBin( endOfBin ),
my_bitMask( bitMask ), my_shiftRightAmount( shiftRightAmount )
{}
void operator()( const blocked_range< long >& r ) const
{
for( long i = r.begin(); i != r.end(); ++i )
{
long numOfElements = my_endOfBin[ i ] - my_startOfBin[ i ];
if ( numOfElements >= Threshold_P )
_RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
else if ( numOfElements >= Threshold )
_RadixSort_Unsigned_PowerOf2Radix_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount );
else if ( numOfElements >= 2 )
insertionSortSimilarToSTLnoSelfAssignment( &my_a[ my_startOfBin[ i ]], numOfElements );
}
}
};
// Listing 3
template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class _Type >
inline void _RadixSort_Unsigned_PowerOf2Radix_3( _Type* a, long last, _Type bitMask, unsigned long shiftRightAmount )
{
const unsigned long numberOfBins = PowerOfTwoRadix;
CountingType_1< PowerOfTwoRadix, _Type > count( a, bitMask, shiftRightAmount ); // contains the count array, which is initialized to all zeros
// Scan the array and count the number of times each value appears
parallel_reduce( blocked_range< unsigned long >( 0, last + 1, 1000 ), count );
long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;
startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
for( unsigned long i = 1; i < numberOfBins; i++ )
startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count.my_count[ i - 1 ];
for ( long _current = 0; _current <= last; )
{
unsigned long digit;
_Type tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
while ( true ) {
digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
if ( endOfBin[ digit ] == _current )
break;
_swap( tmp, a[ endOfBin[ digit ] ] );
endOfBin[ digit ]++;
}
a[ _current ] = tmp;
endOfBin[ digit ]++; // leave the element at its location and grow the bin
_current++; // advance the current pointer to the next element
while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins )
nextBin++;
while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins )
nextBin++;
if ( _current < endOfBin[ nextBin - 1 ] )
_current = endOfBin[ nextBin - 1 ];
}
bitMask >>= Log2ofPowerOfTwoRadix;
if ( bitMask != 0 ) // end recursion when all the bits have been processes
{
if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
else shiftRightAmount = 0;
RadixInPlaceOperation_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, _Type > radixOp( a, startOfBin, endOfBin, bitMask, shiftRightAmount );
parallel_for( blocked_range< long >( 0, numberOfBins ), radixOp );
}
}
The peak at array size of 100 appears at the threshold of the switch between using Insertion Sort and Radix Sort. This peak prompted reevaluation of the threshold value, which was reduced from 100 down to 25, improving performance slightly.
Listing 3 implementation is up to 13% faster than Listing 2, indicating that parallel counting improves performance. Parallel In-Place Radix Sort is significantly faster than STL sort, IPP Radix Sort and TBB parallel_sort. It is up to 8X faster than STL sort, consistently beating it in performance for all array sizes except 100 elements. It is also up to 2X faster than Intel IPP Radix Sort (which uses a single core), plus is in-place whereas IPP Radix Sort is not, requiring an additional array of equal size to the input array. It also up to 2X faster that TBB parallel_sort, consistently beating it in performance for all array sizes except 100 elements.


