BitMagic-C++
bmrandom.h
Go to the documentation of this file.
1 #ifndef BMRANDOM__H__INCLUDED__
2 #define BMRANDOM__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmrandom.h
22  \brief Generation of random subset
23 */
24 
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30 
31 
32 #include "bmfunc.h"
33 #include "bmdef.h"
34 
35 #include <stdlib.h>
36 #include <random>
37 #include <algorithm>
38 
39 namespace bm
40 {
41 
42 
43 /*!
44  Class implements algorithm for random subset generation.
45 
46  Implemented method tries to be fair, but doesn't guarantee
47  true randomeness or absense of bias.
48 
49  Performace note:
50  Class holds temporary buffers and variables, so it is recommended to
51  re-use instances over multiple calls.
52 
53  \ingroup setalgo
54 */
55 template<class BV>
57 {
58 public:
59  typedef BV bvector_type;
60  typedef typename BV::size_type size_type;
61 public:
62  random_subset();
64 
65  /// Get random subset of input vector
66  ///
67  /// @param bv_out - destination vector
68  /// @param bv_in - input vector
69  /// @param sample_count - number of bits to pick
70  ///
71  void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72 
73 
74 private:
75  typedef
76  typename BV::blocks_manager_type blocks_manager_type;
77 
78 private:
79  /// simple picking algorithm for small number of items
80  /// in [first,last] range
81  ///
82  void simple_pick(BV& bv_out,
83  const BV& bv_in,
84  size_type sample_count,
85  size_type first,
86  size_type last);
87 
88  void get_subset(BV& bv_out,
89  const BV& bv_in,
90  size_type in_count,
91  size_type sample_count);
92 
93  void get_block_subset(bm::word_t* blk_out,
94  const bm::word_t* blk_src,
95  unsigned count);
96  static
97  unsigned process_word(bm::word_t* blk_out,
98  const bm::word_t* blk_src,
99  unsigned nword,
100  unsigned take_count);
101 
102  static
103  void get_random_array(bm::word_t* blk_out,
105  unsigned bit_list_size,
106  unsigned count);
107  static
108  unsigned compute_take_count(unsigned bc,
109  size_type in_count, size_type sample_count);
110 
111 
112 private:
114  random_subset& operator=(const random_subset&);
115 private:
116  typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117  bvector_type bv_nb_; ///< blocks vector
118  bm::gap_word_t bit_list_[bm::gap_max_bits];
119  bm::word_t* sub_block_;
120 };
121 
122 
123 ///////////////////////////////////////////////////////////////////
124 
125 
126 
127 template<class BV>
129 {
130  sub_block_ = new bm::word_t[bm::set_block_size];
131 }
132 
133 template<class BV>
135 {
136  delete [] sub_block_;
137 }
138 
139 template<class BV>
140 void random_subset<BV>::sample(BV& bv_out,
141  const BV& bv_in,
142  size_type sample_count)
143 {
144  if (sample_count == 0)
145  {
146  bv_out.clear(true);
147  return;
148  }
149 
150  rsi_.init();
151  bv_in.build_rs_index(&rsi_, &bv_nb_);
152 
153  size_type in_count = rsi_.count();
154  if (in_count <= sample_count)
155  {
156  bv_out = bv_in;
157  return;
158  }
159 
160  float pick_ratio = float(sample_count) / float(in_count);
161  if (pick_ratio < 0.054f)
162  {
163  size_type first, last;
164  bool b = bv_in.find_range(first, last);
165  if (!b)
166  return;
167 
168  simple_pick(bv_out, bv_in, sample_count, first, last);
169  return;
170  }
171 
172  if (sample_count > in_count/2)
173  {
174  // build the complement vector and subtract it
175  BV tmp_bv;
176  size_type delta_count = in_count - sample_count;
177 
178  get_subset(tmp_bv, bv_in, in_count, delta_count);
179  bv_out = bv_in;
180  bv_out -= tmp_bv;
181  return;
182  }
183  get_subset(bv_out, bv_in, in_count, sample_count);
184 }
185 
186 template<class BV>
187 void random_subset<BV>::simple_pick(BV& bv_out,
188  const BV& bv_in,
189  size_type sample_count,
190  size_type first,
191  size_type last)
192 {
193  bv_out.clear(true);
194 
195  std::random_device rd;
196  #ifdef BM64ADDR
197  std::mt19937_64 mt_rand(rd());
198  #else
199  std::mt19937 mt_rand(rd());
200  #endif
201  std::uniform_int_distribution<size_type> dist(first, last);
202 
203  while (sample_count)
204  {
205  size_type fidx;
206  size_type ridx = dist(mt_rand); // generate random position
207 
208  BM_ASSERT(ridx >= first && ridx <= last);
209 
210  bool b = bv_in.find(ridx, fidx); // find next valid bit after random
211  BM_ASSERT(b);
212  if (b)
213  {
214  // set true if was false
215  bool is_set = bv_out.set_bit_conditional(fidx, true, false);
216  sample_count -= is_set;
217  while (!is_set) // find next valid (and not set) bit
218  {
219  ++fidx;
220  // searching always left to right may create a bias...
221  b = bv_in.find(fidx, fidx);
222  if (!b)
223  break;
224  if (fidx > last)
225  break;
226  is_set = bv_out.set_bit_conditional(fidx, true, false);
227  sample_count -= is_set;
228  } // while
229  }
230  } // while
231 }
232 
233 
234 template<class BV>
235 void random_subset<BV>::get_subset(BV& bv_out,
236  const BV& bv_in,
237  size_type in_count,
238  size_type sample_count)
239 {
240  bv_out.clear(true);
241  bv_out.resize(bv_in.size());
242 
243  const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
244  blocks_manager_type& bman_out = bv_out.get_blocks_manager();
245 
246  bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
247 
248  size_type first_nb, last_nb;
249  bool b = bv_nb_.find_range(first_nb, last_nb);
250  BM_ASSERT(b);
251  if (!b)
252  return;
253 
254  std::random_device rd;
255  #ifdef BM64ADDR
256  std::mt19937_64 mt_rand(rd());
257  #else
258  std::mt19937 mt_rand(rd());
259  #endif
260  std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
261 
262  size_type curr_sample_count = sample_count;
263  for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
264  {
265  // pick block at random
266  //
267  size_type nb;
268  size_type ridx = dist_nb(mt_rand); // generate random block idx
269  BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
270 
271  b = bv_nb_.find(ridx, nb); // find next valid nb
272  if (!b)
273  {
274  b = bv_nb_.find(first_nb, nb);
275  if (!b)
276  {
277  b = bv_nb_.find(first_nb, nb);
278  BM_ASSERT(!bv_nb_.any());
279  BM_ASSERT(b);
280  return; // cannot find block
281  }
282  }
283  bv_nb_.clear_bit_no_check(nb); // remove from blocks list
284 
285  // calculate proportinal sample count
286  //
287  unsigned bc = rsi_.count(nb);
288  BM_ASSERT(bc && (bc <= 65536));
289  take_count = compute_take_count(bc, in_count, sample_count);
290  if (take_count > curr_sample_count)
291  take_count = unsigned(curr_sample_count);
292  BM_ASSERT(take_count);
293  if (!take_count)
294  continue;
295  {
296  unsigned i0, j0;
297  bman_in.get_block_coord(nb, i0, j0);
298  const bm::word_t* blk_src = bman_in.get_block(i0, j0);
299  BM_ASSERT(blk_src);
300 
301  // allocate target block
302  bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
303  BM_ASSERT(!blk_out);
304  if (blk_out)
305  {
306  blk_out = bman_out.deoptimize_block(nb);
307  }
308  else
309  {
310  blk_out = bman_out.get_allocator().alloc_bit_block();
311  bman_out.set_block(nb, blk_out);
312  }
313  if (take_count == bc) // whole block take (strange)
314  {
315  // copy the whole src block
316  if (BM_IS_GAP(blk_src))
317  bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
318  else
319  bm::bit_block_copy(blk_out, blk_src);
320  continue;
321  }
322  bm::bit_block_set(blk_out, 0);
323  if (bc < 4096) // use array shuffle
324  {
325  unsigned arr_len;
326  // convert source block to bit-block
327  if (BM_IS_GAP(blk_src))
328  {
329  arr_len = bm::gap_convert_to_arr(bit_list_,
330  BMGAP_PTR(blk_src),
332  }
333  else // bit-block
334  {
335  arr_len = bm::bit_convert_to_arr(bit_list_,
336  blk_src,
339  0);
340  }
341  BM_ASSERT(arr_len);
342  get_random_array(blk_out, bit_list_, arr_len, take_count);
343  }
344  else // dense block
345  {
346  // convert source block to bit-block
347  if (BM_IS_GAP(blk_src))
348  {
349  bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
350  blk_src = tmp_block;
351  }
352  // pick random bits source block to target
353  get_block_subset(blk_out, blk_src, take_count);
354  }
355  }
356  } // for
357 }
358 
359 template<class BV>
360 unsigned random_subset<BV>::compute_take_count(unsigned bc,
361  size_type in_count,
362  size_type sample_count)
363 {
364  float block_percent = float(bc) / float(in_count);
365  float bits_to_take = float(sample_count) * block_percent;
366  bits_to_take += 0.99f;
367  unsigned to_take = unsigned(bits_to_take);
368  if (to_take > bc)
369  to_take = bc;
370  return to_take;
371 }
372 
373 
374 template<class BV>
375 void random_subset<BV>::get_block_subset(bm::word_t* blk_out,
376  const bm::word_t* blk_src,
377  unsigned take_count)
378 {
379  for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
380  {
381  // pick random scan start and scan direction
382  //
383  unsigned i = unsigned(rand()) % bm::set_block_size;
384  unsigned new_count;
385 
386  for (; i < bm::set_block_size && take_count; ++i)
387  {
388  if (blk_src[i] && (blk_out[i] != blk_src[i]))
389  {
390  new_count = process_word(blk_out, blk_src, i, take_count);
391  take_count -= new_count;
392  }
393  } // for i
394 
395  } // for
396  // if masked scan did not produce enough results..
397  //
398  if (take_count)
399  {
400  // Find all vacant bits: do logical (src SUB out)
401  for (unsigned i = 0; i < bm::set_block_size; ++i)
402  {
403  sub_block_[i] = blk_src[i] & ~blk_out[i];
404  }
405  // now transform vacant bits to array, then pick random elements
406  //
407  unsigned arr_len = bit_convert_to_arr(bit_list_,
408  sub_block_,
411  0);
412  BM_ASSERT(arr_len);
413  get_random_array(blk_out, bit_list_, arr_len, take_count);
414  }
415 }
416 
417 template<class BV>
418 unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
419  const bm::word_t* blk_src,
420  unsigned nword,
421  unsigned take_count)
422 {
423  unsigned new_bits, mask;
424  do
425  {
426  mask = unsigned(rand());
427  mask ^= mask << 16;
428  } while (mask == 0);
429 
430  unsigned src_rand = blk_src[nword] & mask;
431  new_bits = src_rand & ~blk_out[nword];
432  if (new_bits)
433  {
434  unsigned new_count = bm::word_bitcount(new_bits);
435 
436  // check if we accidentally picked more bits than needed
437  if (new_count > take_count)
438  {
439  BM_ASSERT(take_count);
440 
441  unsigned char blist[64];
442  unsigned arr_size = bm::bitscan_popcnt(new_bits, blist);
443  BM_ASSERT(arr_size == new_count);
444  std::random_shuffle(blist, blist + arr_size);
445  unsigned value = 0;
446  for (unsigned j = 0; j < take_count; ++j)
447  {
448  value |= (1u << blist[j]);
449  }
450  new_bits = value;
451  new_count = take_count;
452 
453  BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
454  BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
455  }
456 
457  blk_out[nword] |= new_bits;
458  return new_count;
459  }
460  return 0;
461 }
462 
463 
464 template<class BV>
465 void random_subset<BV>::get_random_array(bm::word_t* blk_out,
467  unsigned bit_list_size,
468  unsigned count)
469 {
470  std::random_shuffle(bit_list, bit_list + bit_list_size);
471  for (unsigned i = 0; i < count; ++i)
472  {
473  bm::set_bit(blk_out, bit_list[i]);
474  }
475 }
476 
477 } // namespace
478 
479 #include "bmundef.h"
480 
481 #endif
bm::bvector::rs_index_type
rs_index< allocator_type > rs_index_type
Definition: bm.h:1255
bmfunc.h
Bit manipulation primitives (internal)
bm::random_subset::bvector_type
BV bvector_type
Definition: bmrandom.h:59
BM_IS_GAP
#define BM_IS_GAP(ptr)
Definition: bmdef.h:168
bm::bit_convert_to_arr
T bit_convert_to_arr(T *BMRESTRICT dest, const unsigned *BMRESTRICT src, bm::id_t bits, unsigned dest_len, unsigned mask=0)
Convert bit block into an array of ints corresponding to 1 bits.
Definition: bmfunc.h:6978
bm::set_block_size
const unsigned set_block_size
Definition: bmconst.h:54
bm::word_bitcount
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w)
Definition: bmfunc.h:195
BMGAP_PTR
#define BMGAP_PTR(ptr)
Definition: bmdef.h:166
bm::bit_block_set
void bit_block_set(bm::word_t *BMRESTRICT dst, bm::word_t value)
Bitblock memset operation.
Definition: bmfunc.h:3492
bm::gap_convert_to_bitset
void gap_convert_to_bitset(unsigned *dest, const T *buf)
GAP block to bitblock conversion.
Definition: bmfunc.h:3510
bmundef.h
pre-processor un-defines to avoid global space pollution (internal)
bm::random_subset::sample
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition: bmrandom.h:140
bm::set_bit
BMFORCEINLINE void set_bit(unsigned *dest, unsigned bitpos)
Set 1 bit in a block.
Definition: bmfunc.h:2790
bm::gap_word_t
unsigned short gap_word_t
Definition: bmconst.h:76
BM_ASSERT
#define BM_ASSERT
Definition: bmdef.h:117
bm::bit_list
unsigned bit_list(T w, B *bits)
Unpacks word into list of ON bit indexes.
Definition: bmfunc.h:426
bm::random_subset
Definition: bmrandom.h:56
bm::random_subset::random_subset
random_subset()
Definition: bmrandom.h:128
bmdef.h
Definitions(internal)
bm::gap_max_bits
const unsigned gap_max_bits
Definition: bmconst.h:79
bm::random_subset::~random_subset
~random_subset()
Definition: bmrandom.h:134
bm
Definition: bm.h:76
bm::bit_block_copy
void bit_block_copy(bm::word_t *BMRESTRICT dst, const bm::word_t *BMRESTRICT src)
Bitblock copy operation.
Definition: bmfunc.h:5067
bm::gap_convert_to_arr
D gap_convert_to_arr(D *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned dest_len, bool invert=false)
Convert gap block into array of ints corresponding to 1 bits.
Definition: bmfunc.h:3913
bm::word_t
unsigned int word_t
Definition: bmconst.h:38
bm::bitscan_popcnt
unsigned short bitscan_popcnt(bm::id_t w, B *bits, unsigned short offs)
Unpacks word into list of ON bit indexes using popcnt method.
Definition: bmfunc.h:461
bm::random_subset::size_type
BV::size_type size_type
Definition: bmrandom.h:60