Arndt - Algorithms for Programmers (523138), страница 32
Текст из файла (страница 32)
With modern CPUs and theirconditional move instructions these are not necessarily optimal:static inline long max0(long x)// Return max(0, x), i.e. return zero for negative input// No restriction on input range{return x & ~(x >> (BITS_PER_LONG-1));}orstatic inline ulong upos_abs_diff(ulong a, ulong b)// Return abs(a-b)// Both a and b must not have the most significant bit set{long d1 = b - a;long d2 = (d1 & (d1>>(BITS_PER_LONG-1)))<<1;return d1 - d2; // == (b - d) - (a + d);}The ideas used are sometimes interesting on their own:static inline ulong average(ulong x, ulong y)// Return (x+y)/2// Result is correct even if (x+y) wouldn’t fit into a ulong// Use the fact that x+y == ((x&y)<<1) + (x^y)//that is:sum == carries+ sum_without_carries{return (x & y) + ((x ^ y) >> 1);}orstatic inline void upos_sort2(ulong &a, ulong &b)// Set {a, b} := {minimum(a, b), maximum(a,b)}// Both a and b must not have the most significant bit set{long d = b - a;d &= (d>>(BITS_PER_LONG-1));a += d;b -= d;}Note that the upos_*() functions only work for a limited range (highest bit must not be set) in order tohave the highest bit emulate the carry flag.static inline ulong contains_zero_byte(ulong x)// Determine if any sub-byte of x is zero.// Returns zero when x contains no zero byte and nonzero when it does.// The idea is to subtract 1 from each of the bytes and then look for bytes//where the borrow propagated all the way to the most significant bit.// To scan for other values than zero (e.g.
0xa5) use:// contains_zero_byte( x ^ 0xa5a5a5a5UL ){CHAPTER 8. SOME BIT WIZARDRY186#ifBITS_PER_LONG == 32return ((x-0x01010101UL)^x) & (~x) & 0x80808080UL;// return ((x-0x01010101UL) ^ x) & 0x80808080UL;// ... gives false alarms when a byte of x is 0x80://hex: 80-01 = 7f, 7f^80 = ff, ff & 80 = 80#endif#if BITS_PER_LONG == 64return ((x-0x0101010101010101UL) ^ x) & (~x) & 0x8080808080808080UL;#endif}from [FXT: file auxbit/zerobyte.h] may only be a gain for ≥128 bit words (cf. [FXT: long strlen andlong memchr in aux1/bytescan.cc]), however, the underlying idea is nice enough to be documentedhere.8.19Hilbert’s space-filling curveThe famous space-filling curve of Hilbert can be used to define a mapping from a one-dimensional coordinate (linear coordinate of the curve) to the pair of coordinates x and y.
The function ([FXT: hilbertin auxbit/hilbert.cc]) that does the job is a state-engine as suggested in [53], item 115:voidhilbert(ulong t, ulong &x, ulong &y)// Transform linear coordinate to hilbert x and y{ulong xv = 0, yv = 0;ulong c01 = (0<<2); // (2<<2) for transposed output (swapped x, y)for (ulong i=0; i<(BITS_PER_LONG/2); ++i){ulong abi = t >> (BITS_PER_LONG-2);t <<= 2;ulong st = htab[ (c01<<2) | abi ];c01 = st & 3;yv <<= 1;yv |= ((st>>2) & 1);xv <<= 1;xv |= (st>>3);}x = xv; y = yv;}The table used is defined asstatic const ulong htab[] = {#define HT(xi,yi,c0,c1) ((xi<<3)+(yi<<2)+(c0<<1)+(c1))// index == HT(c0,c1,ai,bi)HT( 0, 0, 1, 0 ),HT( 0, 1, 0, 0 ),HT( 1, 1, 0, 0 ),HT( 1, 0, 0, 1 ),HT( 1, 1, 1, 1 ),HT( 0, 1, 0, 1 ),HT( 0, 0, 0, 1 ),HT( 1, 0, 0, 0 ),HT( 0, 0, 0, 0 ),HT( 1, 0, 1, 0 ),HT( 1, 1, 1, 0 ),HT( 0, 1, 1, 1 ),HT( 1, 1, 0, 1 ),HT( 1, 0, 1, 1 ),HT( 0, 0, 1, 1 ),HT( 0, 1, 1, 0 )#undef HT};Apart from the nice images the algorithm can be used to generate a Gray code using the followingobservation: with each increment by one of the linear coordinate t exactly one of the generated x and yCHAPTER 8.
SOME BIT WIZARDRY187changes its value be ±1. Therefore the concatenation of the4 Gray codes of x and y gives another Graycode. It turns out that the routine is most effective if the bits of (the Gray code of) x and y are interlacedas with the bit-zip function given in section 8.16:ulonghilbert_gray_code(ulong t)//// A Gray code based on the function hilbert().// Equivalent to the following sequence of statements://hilbert(t, x, y);//x=gray_code(x);//y=gray_code(y);//return bitzip(y, x);{ulong g = 0;ulong c01 = (0<<2); // (2<<2) for transposed output (swapped x, y)for (ulong i=0; i<(BITS_PER_LONG/2); ++i){ulong abi = t >> (BITS_PER_LONG-2);t <<= 2;ulong st = htab[ (c01<<2) | abi ];c01 = st & 3;g <<= 2;g |= (st>>2);}#if ( BITS_PER_LONG <= 32 )g ^= ( (g & 0x55555555UL) >> 2 );g ^= ( (g & 0xaaaaaaaaUL) >> 2 );#elseg ^= ( (g & 0x5555555555555555UL) >> 2 );g ^= ( (g & 0xaaaaaaaaaaaaaaaaUL) >> 2 );#endifreturn g;}The demo ([FXT: file demo/hilbert-demo.cc]) should clarify the idea, it gives the output##############################t01234567891011121314151617181920212223242526272829->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->->4 actuallybinary(x)x=......x=.....1x=.....1x=......x=......x=......x=.....1x=.....1x=....1.x=....1.x=....11x=....11x=....11x=....1.x=....1.x=....11x=...1..x=...1..x=...1.1x=...1.1x=...11.x=...111x=...111x=...11.x=...11.x=...111x=...111x=...11.x=...1.1x=...1.1binary(y)=y=......
=y=...... =y=.....1 =y=.....1 =y=....1. =y=....11 =y=....11 =y=....1. =y=....1. =y=....11 =y=....11 =y=....1. =y=.....1 =y=.....1 =y=...... =y=...... =y=...... =y=.....1 =y=.....1 =y=...... =y=...... =y=...... =y=.....1 =y=.....1 =y=....1. =y=....1. =y=....11 =y=....11 =y=....11 =y=....1. =(((((((((((((((((((((((((((((((x,0,1,1,0,0,0,1,1,2,2,3,3,3,2,2,3,4,4,5,5,6,7,7,6,6,7,7,6,5,5,y)0)0)1)1)2)3)3)2)2)3)3)2)1)1)0)0)0)1)1)0)0)0)1)1)2)2)3)3)3)2)gx=gray(x)gx=......gx=.....1gx=.....1gx=......gx=......gx=......gx=.....1gx=.....1gx=....11gx=....11gx=....1.gx=....1.gx=....1.gx=....11gx=....11gx=....1.gx=...11.gx=...11.gx=...111gx=...111gx=...1.1gx=...1..gx=...1..gx=...1.1gx=...1.1gx=...1..gx=...1..gx=...1.1gx=...111gx=...111gy=gray(y)gy=......gy=......gy=.....1gy=.....1gy=....11gy=....1.gy=....1.gy=....11gy=....11gy=....1.gy=....1.gy=....11gy=.....1gy=.....1gy=......gy=......gy=......gy=.....1gy=.....1gy=......gy=......gy=......gy=.....1gy=.....1gy=....11gy=....11gy=....1.gy=....1.gy=....1.gy=....11any Gray code does it, we use the standard binary version here.z=bitzip(gx,gy) == gz=......g=......z=....1.g=....1.z=....11g=....11z=.....1g=.....1z=...1.1g=...1.1z=...1..g=...1..z=...11.g=...11.z=...111g=...111z=..1111g=..1111z=..111.g=..111.z=..11..g=..11..z=..11.1g=..11.1z=..1..1g=..1..1z=..1.11g=..1.11z=..1.1.g=..1.1.z=..1...g=..1...z=1.1...g=1.1...z=1.1..1g=1.1..1z=1.1.11g=1.1.11z=1.1.1.g=1.1.1.z=1...1.g=1...1.z=1.....g=1.....z=1....1g=1....1z=1...11g=1...11z=1..111g=1..111z=1..1.1g=1..1.1z=1..1..g=1..1..z=1..11.g=1..11.z=1.111.g=1.111.z=1.1111g=1.1111CHAPTER 8.
SOME BIT WIZARDRY# 30 -># 31 ->8.20x=...1..x=...1..y=....1. = ( 4,y=....11 = ( 4,1882)3)gx=...11.gx=...11.gy=....11gy=....1.z=1.11.1z=1.11..g=1.11.1g=1.11..Manipulation of colorsIn the following it is assumed that the type uint (unsigned integer) contains at least 32 bit. In thissection This data type is exclusively used as a container for three color channels that are assumed to be8 bit each and lie at the lower end of the word. The functions do not depend on how the channels areordered (e.g. RGB or BGR).The following functions are obviously candidates for your CPUs SIMD-extensions (if it has any).
However,having the functionality in a platform independent manner that is sufficiently fast for most practicalpurposes5 is reason enough to include this section.Scaling a color by an integer value:static inline uint color01(uint c, ulong v)// return color with each channel scaled by v// 0 <= v <= (1<<16) corresponding to 0.0 ... 1.0{uint t;t = c & 0xff00ff00; // must include alpha channel bits ...c ^= t; // ... because they must be removed heret *= v;t >>= 24; t <<= 8;v >>= 8;c *= v;c >>= 8;c &= 0xff00ff;return c | t;}. . . used in the computation of the weighted average of colors:static inline uint color_mix(uint c1, uint c2, ulong v)// return channelwise average of colors// (1.0-v)*c1 and v*c2//// 0 <= v <= (1<<16) corresponding to 0.0 ...
1.0// c1...c2{ulong w = ((ulong)1<<16)-v;c1 = color01(c1, w);c2 = color01(c2, v);return c1 + c2; // no overflow in color channels}Channel-wise average of two colors:static inline uint color_mix_50(uint c1, uint c2)// return channelwise average of colors c1 and c2//// shortcut for the special case (50% transparency)//of color_mix(c1, c2, "0.5")//// least significant bits are ignored{return ((c1 & 0xfefefe) + (c2 & 0xfefefe)) >> 1;}// 50% c1. . .
and with higher weight of the first color:static inline uint color_mix_75(uint c1, uint c2)// least significant bits are ignored5 The software rendering program that uses these functions operates at a not too small fraction of memory bandwidthwhen all of environment mapping, texture mapping and translucent objects are shown with (very) simple scenes.CHAPTER 8. SOME BIT WIZARDRY{}returncolor_mix_50(c1, color_mix_50(c1, c2));189// 75% c1Saturated addition of color channels:static inline uint color_sum(uint c1, uint c2)// least significant bits are ignored{uint s = color_mix_50(c1, c2);return color_sum_adjust(s);}which uses:static inline uint color_sum_adjust(uint s)// set color channel to max (0xff) iff an overflow occured// (that is, leftmost bit in channel is set){uint m = s & 0x808080; // 1000 0000 // overflow bitss ^= m;m >>= 7;// 0000 0001m *= 0xff; // 1111 1111 // optimized to (m<<8)-m by gccreturn (s << 1) | m;}Channel-wise product of two colors:static inline uint color_mult(uint c1, uint c2)// corresponding to an object of color c1// illuminated by a light of color c2{uint t = ((c1 & 0xff) * (c2 & 0xff)) >> 8;c1 >>= 8; c2 >>= 8;t |= ((c1 & 0xff) * (c2 & 0xff)) & 0xff00;c1 &= 0xff00; c2 >>= 8;t |= ((c1 * c2) & 0xff0000);return t;}When one does not want to discard the lowest channel bits (e.g.
because numerous such operations appearin a row) a more ‘perfect’ version is required:static inline uint perfect_color_mix_50(uint c1, uint c2)// return channelwise average of colors c1 and c2{//uint t = (c1 | c2) & 0x010101; // lowest channels bits in any arguint t = (c1 & c2) & 0x010101; // lowest channels bits in both argsreturn color_mix_50(c1, c2) + t;}static inline uint perfect_color_sum(uint c1, uint c2){uint srb = (c1 & 0xff00ff) + (c2 & 0xff00ff) + 0x010001;uint mrb = srb & 0x01000100;srb ^= mrb;uint sg = (c1 & 0xff00) + (c2 & 0xff00) + 0x0100;uint mg = (sg & 0x010000);sg ^= mg;uint m = (mrb | mg) >> 1; // 1000 0000 // overflow bitsm |= (m >> 1); // 1100 0000m |= (m >> 2); // 1111 0000m |= (m >> 4); // 1111 1111return srb | sg | m;}Note that the last two functions are overkill for most practical purposes.CHAPTER 8.
SOME BIT WIZARDRY8.211902-adic inverse and rootThe 2-adic inverse can be computed using an iteration (see 13.3) with quadratic convergence. The numberto be inverted has to be odd.inline ulong inv2adic(ulong x)// Return inverse modulo 2**BITS_PER_LONG// x must be odd// number of correct bits are doubled with each step// ==> loop is executed prop. log_2(BITS_PER_LONG) times// precision is 3, 6, 12, 24, 48, 96, ... bits (or better){if ( 0==(x&1) ) return 0; // not invertibleulong i = x; // correct to three bits at leastulong p;do{p = i * x;i *= (2UL - p);}while ( p!=1 );return i;}With the inverse square root we choose the start value to match bd/2c + 1 as that guarantees four bitsof initial precision. Moreover, we get control to which of the two possible values the inverse square rootis finally reached.