FACT++  1.0
huffman.h
Go to the documentation of this file.
1 #ifndef FACT_huffman
2 #define FACT_huffman
3 
4 #include <string.h>
5 #include <stdint.h>
6 
7 #include <set>
8 #include <string>
9 #include <vector>
10 
11 #define MAX_SYMBOLS (1<<16)
12 
13 // ================================================================
14 
15 namespace Huffman
16 {
17  static unsigned long numbytes_from_numbits(unsigned long numbits)
18  {
19  return numbits / 8 + (numbits % 8 ? 1 : 0);
20  }
21 
22  struct TreeNode
23  {
25  union
26  {
27  struct
28  {
30  };
31  uint16_t symbol;
32  };
33 
34  size_t count;
35  bool isLeaf;
36 
37  TreeNode(uint16_t sym, size_t cnt=0) : parent(0), isLeaf(true)
38  {
39  symbol = sym;
40  count = cnt;
41  }
42 
43  TreeNode(TreeNode *n0=0, TreeNode *n1=0) : parent(0), isLeaf(false)
44  {
45  count = n0 && n1 ? n0->count + n1->count : 0;
46  zero = n0 && n1 ? (n0->count > n1->count ? n0 : n1) : NULL;
47  one = n0 && n1 ? (n0->count > n1->count ? n1 : n0) : NULL;
48 
49  if (n0)
50  n0->parent = this;
51 
52  if (n1)
53  n1->parent = this;
54  }
55 
57  {
58  if (isLeaf)
59  return;
60 
61  if (zero)
62  delete zero;
63  if (one)
64  delete one;
65  }
66 
67  bool operator() (const TreeNode *hn1, const TreeNode *hn2) const
68  {
69  return hn1->count < hn2->count;
70  }
71  };
72 
73 
74  struct Encoder
75  {
76  struct Code
77  {
78  size_t bits;
79  uint8_t numbits;
80 
81  Code() : numbits(0) { }
82  };
83 
84  size_t count;
85  Code lut[1<<16];
86 
87  bool CreateEncoder(const TreeNode *n, size_t bits=0, uint8_t nbits=0)
88  {
89  if (n->isLeaf)
90  {
91 #ifdef __EXCEPTIONS
92  if (nbits>sizeof(size_t)*8)
93  throw std::runtime_error("Too many different symbols - this should not happen!");
94 #else
95  if (nbits>sizeof(size_t)*8)
96  {
97  count = 0;
98  return false;
99  }
100 #endif
101  lut[n->symbol].bits = bits;
102  lut[n->symbol].numbits = nbits==0 ? 1 : nbits;
103  count++;
104  return true;
105  }
106 
107  return
108  CreateEncoder(n->zero, bits, nbits+1) &&
109  CreateEncoder(n->one, bits | (1<<nbits), nbits+1);
110 
111  }
112 
113  void WriteCodeTable(std::string &out) const
114  {
115  out.append((char*)&count, sizeof(size_t));
116 
117  for (uint32_t i=0; i<MAX_SYMBOLS; i++)
118  {
119  const Code &n = lut[i];
120  if (n.numbits==0)
121  continue;
122 
123  // Write the 2 byte symbol.
124  out.append((char*)&i, sizeof(uint16_t));
125  if (count==1)
126  return;
127 
128  // Write the 1 byte code bit length.
129  out.append((char*)&n.numbits, sizeof(uint8_t));
130 
131  // Write the code bytes.
132  uint32_t numbytes = numbytes_from_numbits(n.numbits);
133  out.append((char*)&n.bits, numbytes);
134  }
135  }
136 
137  void Encode(std::string &out, const uint16_t *bufin, size_t bufinlen) const
138  {
139  if (count==1)
140  return;
141 
142  uint8_t curbyte = 0;
143  uint8_t curbit = 0;
144 
145  for (uint32_t i=0; i<bufinlen; ++i)
146  {
147  const uint16_t &symbol = bufin[i];
148 
149  const Code *code = lut+symbol;
150 
151  uint8_t nbits = code->numbits;
152  const uint8_t *bits = (uint8_t*)&code->bits;
153 
154  while (nbits>0)
155  {
156  // Number of bits available in the current byte
157  const uint8_t free_bits = 8 - curbit;
158 
159  // Write bits to current byte
160  curbyte |= *bits<<curbit;
161 
162  // If the byte has been filled, put it into the output buffer
163  // If the bits exceed the current byte step to the next byte
164  // and fill it properly
165  if (nbits>=free_bits)
166  {
167  out += curbyte;
168  curbyte = *bits>>free_bits;
169 
170  bits++;
171  }
172 
173  // Adapt the number of available bits, the number of consumed bits
174  // and the bit-pointer accordingly
175  const uint8_t consumed = nbits>8 ? 8 : nbits;
176  nbits -= consumed;
177  curbit += consumed;
178  curbit %= 8;
179  }
180  }
181 
182  // If the buffer-byte is half-full, also add it to the output buffer
183  if (curbit>0)
184  out += curbyte;
185  }
186 
187  Encoder(const uint16_t *bufin, size_t bufinlen) : count(0)
188  {
189  uint64_t counts[MAX_SYMBOLS];
190  memset(counts, 0, sizeof(uint64_t)*MAX_SYMBOLS);
191 
192  // Count occurances
193  for (const uint16_t *p=bufin; p<bufin+bufinlen; p++)
194  counts[*p]++;
195 
196  // Copy all occuring symbols into a sorted list
197  std::multiset<TreeNode*, TreeNode> set;
198  for (int i=0; i<MAX_SYMBOLS; i++)
199  if (counts[i])
200  set.insert(new TreeNode(i, counts[i]));
201 
202  // Create the tree bottom-up
203  while (set.size()>1)
204  {
205  auto it = set.begin();
206 
207  auto it1 = it++;
208  auto it2 = it;
209 
210  TreeNode *nn = new TreeNode(*it1, *it2);
211 
212  set.erase(it1, ++it2);
213 
214  set.insert(nn);
215  }
216 
217  // get the root of the tree
218  const TreeNode *root = *set.begin();
219 
220  CreateEncoder(root);
221 
222  // This will delete the whole tree
223  delete root;
224  }
225 
226  };
227 
228 
229 
230  struct Decoder
231  {
232  uint16_t symbol;
233  uint8_t nbits;
234  bool isLeaf;
235 
237 
238  Decoder() : isLeaf(false), lut(NULL)
239  {
240  }
241 
243  {
244  if (lut)
245  delete [] lut;
246  }
247 
248  void Set(uint16_t sym, uint8_t n=0, size_t bits=0)
249  {
250  if (!lut)
251  lut = new Decoder[256];
252 
253  if (n>8)
254  {
255  lut[bits&0xff].Set(sym, n-8, bits>>8);
256  return;
257  }
258 
259  const int nn = 1<<(8-n);
260 
261  for (int i=0; i<nn; i++)
262  {
263  const uint8_t key = bits | (i<<n);
264 
265  lut[key].symbol = sym;
266  lut[key].isLeaf = true;
267  lut[key].nbits = n;
268  }
269  }
270 
271  void Build(const TreeNode &p, uint64_t bits=0, uint8_t n=0)
272  {
273  if (p.isLeaf)
274  {
275  Set(p.symbol, n, bits);
276  return;
277  }
278 
279  Build(*p.zero, bits, n+1);
280  Build(*p.one, bits | (1<<n), n+1);
281  }
282 
283  Decoder(const TreeNode &p) : symbol(0), isLeaf(false), lut(NULL)
284  {
285  Build(p);
286  }
287 
288  const uint8_t *Decode(const uint8_t *in_ptr, const uint8_t *in_end,
289  uint16_t *out_ptr, const uint16_t *out_end) const
290  {
291  Decoder const *p = this;
292 
293  if (in_ptr==in_end)
294  {
295  while (out_ptr < out_end)
296  *out_ptr++ = p->lut->symbol;
297  return in_ptr;
298  }
299 
300  uint8_t curbit = 0;
301  while (in_ptr<in_end && out_ptr<out_end)
302  {
303  const uint16_t *two = (uint16_t*)in_ptr;
304 
305  const uint8_t curbyte = (*two >> curbit);
306 
307 #ifdef __EXCEPTIONS
308  if (!p->lut)
309  throw std::runtime_error("Unknown bitcode in stream!");
310 #else
311  if (!p->lut)
312  return NULL;
313 #endif
314 
315  p = p->lut + curbyte;
316  if (!p->isLeaf)
317  {
318  in_ptr++;
319  continue;
320  }
321 
322  *out_ptr++ = p->symbol;
323  curbit += p->nbits;
324 
325  p = this;
326 
327  if (curbit>=8)
328  {
329  curbit %= 8;
330  in_ptr++;
331  }
332 
333  }
334 
335  return curbit ? in_ptr+1 : in_ptr;
336  }
337 
338  Decoder(const uint8_t* bufin, int64_t &pindex) : isLeaf(false), lut(NULL)
339  {
340  // FIXME: Sanity check for size missing....
341 
342  // Read the number of entries.
343  size_t count=0;
344  memcpy(&count, bufin + pindex, sizeof(count));
345  pindex += sizeof(count);
346 
347  // Read the entries.
348  for (size_t i=0; i<count; i++)
349  {
350  uint16_t sym;
351  memcpy(&sym, bufin + pindex, sizeof(uint16_t));
352  pindex += sizeof(uint16_t);
353 
354  if (count==1)
355  {
356  Set(sym);
357  break;
358  }
359 
360  uint8_t numbits;
361  memcpy(&numbits, bufin + pindex, sizeof(uint8_t));
362  pindex += sizeof(uint8_t);
363 
364  const uint8_t numbytes = numbytes_from_numbits(numbits);
365 
366 #ifdef __EXCEPTIONS
367  if (numbytes>sizeof(size_t))
368  throw std::runtime_error("Number of bytes for a single symbol exceeds maximum.");
369 #else
370  if (numbytes>sizeof(size_t))
371  {
372  pindex = -1;
373  return;
374  }
375 #endif
376  size_t bits=0;
377  memcpy(&bits, bufin+pindex, numbytes);
378  pindex += numbytes;
379 
380  Set(sym, numbits, bits);
381  }
382  }
383  };
384 
385  inline bool Encode(std::string &bufout, const uint16_t *bufin, size_t bufinlen)
386  {
387  const Encoder encoder(bufin, bufinlen);
388 
389 #ifndef __EXCEPTIONS
390  if (encoder.count==0)
391  return false;
392 #endif
393 
394  bufout.append((char*)&bufinlen, sizeof(size_t));
395  encoder.WriteCodeTable(bufout);
396  encoder.Encode(bufout, bufin, bufinlen);
397 
398  return true;
399  }
400 
401  inline int64_t Decode(const uint8_t *bufin,
402  size_t bufinlen,
403  std::vector<uint16_t> &pbufout)
404  {
405  int64_t i = 0;
406 
407  // Read the number of data bytes this encoding represents.
408  size_t data_count = 0;
409  memcpy(&data_count, bufin, sizeof(size_t));
410  i += sizeof(size_t);
411 
412 
413  pbufout.resize(data_count);
414 
415  const Decoder decoder(bufin, i);
416 
417 #ifndef __EXCEPTIONS
418  if (i==-1)
419  return -1;
420 #endif
421 
422  const uint8_t *in_ptr =
423  decoder.Decode(bufin+i, bufin+bufinlen,
424  pbufout.data(), pbufout.data()+data_count);
425 
426 #ifndef __EXCEPTIONS
427  if (!in_ptr)
428  return -1;
429 #endif
430 
431  return in_ptr-bufin;
432  }
433 };
434 
435 #endif
Encoder(const uint16_t *bufin, size_t bufinlen)
Definition: huffman.h:187
#define MAX_SYMBOLS
Definition: huffman.h:11
void Encode(std::string &out, const uint16_t *bufin, size_t bufinlen) const
Definition: huffman.h:137
int i
Definition: db_dim_client.c:21
TreeNode(TreeNode *n0=0, TreeNode *n1=0)
Definition: huffman.h:43
TreeNode(uint16_t sym, size_t cnt=0)
Definition: huffman.h:37
int64_t Decode(const uint8_t *bufin, size_t bufinlen, std::vector< uint16_t > &pbufout)
Definition: huffman.h:401
void WriteCodeTable(std::string &out) const
Definition: huffman.h:113
const uint8_t * Decode(const uint8_t *in_ptr, const uint8_t *in_end, uint16_t *out_ptr, const uint16_t *out_end) const
Definition: huffman.h:288
bool Encode(std::string &bufout, const uint16_t *bufin, size_t bufinlen)
Definition: huffman.h:385
static unsigned long numbytes_from_numbits(unsigned long numbits)
Definition: huffman.h:17
bool CreateEncoder(const TreeNode *n, size_t bits=0, uint8_t nbits=0)
Definition: huffman.h:87
bool Set(bool &rc, const QString &value)
size_t count
Definition: huffman.h:84
TreeNode * zero
Definition: huffman.h:29
uint16_t symbol
Definition: huffman.h:31
void Set(uint16_t sym, uint8_t n=0, size_t bits=0)
Definition: huffman.h:248
TreeNode * parent
Definition: huffman.h:24
int size
Definition: db_dim_server.c:17
Decoder * lut
Definition: huffman.h:236
uint16_t symbol
Definition: huffman.h:232
uint8_t nbits
Definition: huffman.h:233
Decoder(const TreeNode &p)
Definition: huffman.h:283
size_t count
Definition: huffman.h:34
void Build(const TreeNode &p, uint64_t bits=0, uint8_t n=0)
Definition: huffman.h:271
bool operator()(const TreeNode *hn1, const TreeNode *hn2) const
Definition: huffman.h:67
Decoder(const uint8_t *bufin, int64_t &pindex)
Definition: huffman.h:338
uint32_t counts
Definition: HeadersSQM.h:91
TreeNode * one
Definition: huffman.h:29