FACT++  1.0
zfits.cc
Go to the documentation of this file.
1 #include <fstream>
2 #include <iostream>
3 #include <algorithm>
4 #include <map>
5 #include <vector>
6 
7 #include "Configuration.h"
8 
9 #include "externals/fits.h"
10 #include "externals/huffman.h"
11 
12 #include "Time.h"
13 
14 using namespace std;
15 
17 {
18  po::options_description control("zfits");
19  control.add_options()
20  ("in", var<string>()->required(), "")
21  ("out", var<string>(), "")
22  ("decompress,d", po_switch(), "")
23  ("force,f", var<string>(), "Force overwrite of output file")
24  ;
25 
26  po::positional_options_description p;
27  p.add("in", 1); // The 1st positional options
28  p.add("out", 2); // The 2nd positional options
29 
30  conf.AddOptions(control);
31  conf.SetArgumentPositions(p);
32 }
33 
34 void PrintUsage()
35 {
36  cout <<
37  "zfits - A fits compressor\n"
38  "\n"
39  "\n"
40  "Usage: zfits [-d] input.fits[.gz] [output.zf]\n";
41  cout << endl;
42 }
43 
44 
45 string ReplaceEnd(const string &str, const string &expr, const string &repl)
46 {
47  string out(str);
48 
49  const size_t p = out.rfind(expr);
50  if (p==out.size()-expr.length())
51  out.replace(p, expr.length(), repl);
52 
53  return out;
54 }
55 
56 
57 string ReplaceExt(const string &name, bool decomp)
58 {
59  if (decomp)
60  return ReplaceEnd(name, ".zfits", ".fits");
61 
62  string out = ReplaceEnd(name, ".fits", ".zfits");
63  return ReplaceEnd(out, ".fits.gz", ".zfits");
64 }
65 
66 int Compress(const string &ifile, const string &ofile)
67 {
68  // when to print some info on the screen (every f percent)
69  float frac = 0.01;
70 
71  // open a fits file
72  fits f(ifile);
73 
74  // open output file
75  ofstream fout(ofile);
76 
77  // counters for total size and compressed size
78  uint64_t tot = 0;
79  uint64_t com = 0;
80 
81  // very simple timer
82  double sec = 0;
83 
84  // Produce a lookup table with all informations about the
85  // columns in the same order as they are in the file
86  const fits::Table::Columns &cols= f.GetColumns();
87 
88  struct col_t : fits::Table::Column
89  {
90  string name;
91  void *ptr;
92  };
93 
94 
95  map<size_t, col_t> columns;
96 
97  size_t row_tot = 0;
98  for (auto it=cols.begin(); it!=cols.end(); it++)
99  {
100  col_t c;
101 
102  c.offset = it->second.offset;
103  c.size = it->second.size;
104  c.num = it->second.num;
105  c.name = it->first;
106  c.ptr = f.SetPtrAddress(it->first);
107 
108  columns[c.offset] = c;
109 
110  row_tot += c.size*c.num;
111  }
112 
113  // copy the header from the input to the output file
114  // and prefix the output file as a compressed fits file
115  string header;
116  header.resize(f.tellg());
117 
118  f.seekg(0);
119  f.read((char*)header.c_str(), header.size());
120 
121  char m[2];
122  m[0] = 'z'+128;
123  m[1] = 'f'+128;
124 
125  const size_t hlen = 0;
126 
127  size_t hs = header.size();
128 
129  fout.write(m, 2); // magic number
130  fout.write((char*)&hlen, sizeof(size_t)); // length of possible header data (e.g. file version, compression algorithm)
131  fout.write((char*)&hs, sizeof(size_t)); // size of FITS header
132  fout.write(header.c_str(), header.size()); // uncompressed FITS header
133 
134  tot += header.size();
135  com += header.size()+2+2*sizeof(size_t);
136 
137  cout << fixed;
138 
139  Time start;
140 
141  // loop over all rows
142  vector<char> cache(row_tot);
143  while (f.GetNextRow())
144  {
145  // pointer to the start of the cache for the data of one row
146  char *out = cache.data();
147 
148  // mask stroing which column have been compressed and which not
149  vector<uint8_t> mask(cols.size()/8 + 1);
150 
151  // loop over all columns
152  uint32_t icol = 0;
153  for (auto it=columns.begin(); it!=columns.end(); it++, icol++)
154  {
155  // size of cell in bytes
156  const size_t len_col = it->second.size * it->second.num;
157 
158  // get pointer to data
159  int16_t *ptr = (int16_t*)it->second.ptr;
160 
161  // If the column is the data, preprocess the data
162  /*
163  if (it->second.name=="Data")
164  {
165  int16_t *end = ptr+1440*300-4-(1440*300)%2;
166  int16_t *beg = ptr;
167 
168  while (end>=beg)
169  {
170  const int16_t avg = (end[0] + end[1])/2;
171  end[2] -= avg;
172  end[3] -= avg;
173  end -=2;
174  }
175  }*/
176 
177  // do not try to compress less than 32bytes
178  if (len_col>32 && it->second.size==2)
179  {
180  Time now;
181 
182  // perform 16bit hoffman (option for 8bit missing, skip 64bit)
183  // (what to do with floats?)
184  string buf;
185  /*int len =*/ Huffman::Encode(buf, (uint16_t*)ptr, len_col/2);
186 
187  sec += Time().UnixTime()-now.UnixTime();
188 
189  // check if data was really compressed
190  if (buf.size()<len_col)
191  {
192  // copy compressed data into output cache
193  memcpy(out, buf.c_str(), buf.size());
194  out += buf.size();
195 
196  // update mask
197  const uint64_t bit = (icol%8);
198  mask[icol/8] |= (1<<bit);
199 
200  continue;
201  }
202  }
203 
204  // just copy the data if it has not been compressed
205  memcpy(out, (char*)ptr, len_col);
206  out += len_col;
207  }
208 
209  // calcualte size of output buffer
210  const size_t sz = out-cache.data();
211 
212  // update counters
213  tot += row_tot;
214  com += sz + mask.size();
215 
216  // write the compression mask and the (partly) copmpressed data stream
217  fout.write((char*)mask.data(), mask.size());
218  fout.write(cache.data(), sz);
219 
220  //if (sz2<0 || memcmp(data, dest3.data(), 432000*2)!=0)
221  // cout << "grrrr" << endl;
222 
223  const float proc = float(f.GetRow())/f.GetNumRows();
224  if (proc>frac)
225  {
226  const double elep = Time().UnixTime()-start.UnixTime();
227  cout << "\r" << setprecision(0) << setw(3) << 100*proc << "% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s in:" << tot/1000000/elep << "MB/s" << flush;
228  frac += 0.01;
229  }
230  }
231 
232  const double elep = Time().UnixTime()-start.UnixTime();
233  cout << setprecision(0) << "\r100% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s in:" << tot/1000000/elep << "MB/s" << endl;
234 
235  return 0;
236 }
237 
238 template<size_t N>
239 void revcpy(char *dest, const char *src, int num)
240 {
241  const char *pend = src + num*N;
242  for (const char *ptr = src; ptr<pend; ptr+=N, dest+=N)
243  reverse_copy(ptr, ptr+N, dest);
244 }
245 
246 int Decompress(const string &ifile, const string &ofile)
247 {
248  // open a fits file
249  ifstream fin(ifile);
250 
251  // open output file
252  ofstream fout(ofile);
253 
254  // get and check magic number
255  unsigned char m[2];
256  fin.read((char*)m, 2);
257  if (m[0]!='z'+128 || m[1]!='f'+128)
258  throw runtime_error("File not a compressed fits file.");
259 
260  // get length of additional header information
261  size_t hlen = 0;
262  fin.read((char*)&hlen, sizeof(size_t));
263  if (hlen>0)
264  throw runtime_error("Only Version-zero files supported.");
265 
266  // get size of FITS header
267  size_t hs = 0;
268  fin.read((char*)&hs, sizeof(size_t));
269  if (!fin)
270  throw runtime_error("Could not access header size.");
271 
272  // copy the header from the input to the output file
273  // and prefix the output file as a compressed fits file
274  string header;
275  header.resize(hs);
276 
277  fin.read((char*)header.c_str(), header.size());
278  fout.write((char*)header.c_str(), header.size());
279  if (!fin)
280  throw runtime_error("Could not read full header");
281 
282  string templ("tmpXXXXXX");
283  int fd = mkstemp((char*)templ.c_str());
284  const ssize_t rc = write(fd, header.c_str(), header.size());
285  close(fd);
286 
287  if (rc<0)
288  throw runtime_error("Could not write to temporary file: "+string(strerror(errno)));
289 
290  // open the output file to get the header parsed
291  fits info(templ);
292 
293  remove(templ.c_str());
294 
295  // get the maximum size of one row and a list
296  // of all columns ordered by their offset
297  size_t row_tot = 0;
298  const fits::Table::Columns &cols = info.GetColumns();
299  map<size_t, fits::Table::Column> columns;
300  for (auto it=cols.begin(); it!=cols.end(); it++)
301  {
302  columns[it->second.offset] = it->second;
303  row_tot += it->second.num*it->second.size;
304  }
305 
306  // very simple timer
307  double sec = 0;
308  double frac = 0;
309 
310  size_t com = 2+hs+2*sizeof(size_t);
311  size_t tot = hs;
312 
313  const size_t masklen = cols.size()/8+1;
314 
315  // loop over all rows
316  vector<char> buf(row_tot+masklen);
317  vector<char> swap(row_tot);
318  uint32_t offset = 0;
319 
320  const uint64_t nrows = info.GetUInt("NAXIS2");
321 
322  Time start;
323 
324  cout << fixed;
325  for (uint32_t irow=0; irow<nrows; irow++)
326  {
327  fin.read(buf.data()+offset, buf.size()-offset);
328 
329  const uint8_t *mask = reinterpret_cast<uint8_t*>(buf.data());
330  offset = masklen;
331 
332  char *ptr = swap.data();
333 
334  uint32_t icol = 0;
335  for (auto it=columns.begin(); it!= columns.end(); it++, icol++)
336  {
337  const size_t &num = it->second.num;
338  const size_t &size = it->second.size;
339 
340  if (mask[icol/8]&(1<<(icol%8)))
341  {
342  Time now;
343 
344  vector<uint16_t> out(num*size/2);
345  int len = Huffman::Decode((uint8_t*)buf.data()+offset, buf.size()-offset, out);
346  if (len<0)
347  throw runtime_error("Decoding failed.");
348 
349  sec += Time().UnixTime()-now.UnixTime();
350 
351  offset += len;
352 
353  revcpy<2>(ptr, (char*)out.data(), num);
354  }
355  else
356  {
357  switch (size)
358  {
359  case 1: memcpy (ptr, buf.data()+offset, num*size); break;
360  case 2: revcpy<2>(ptr, buf.data()+offset, num); break;
361  case 4: revcpy<4>(ptr, buf.data()+offset, num); break;
362  case 8: revcpy<8>(ptr, buf.data()+offset, num); break;
363  }
364 
365  offset += num*size;
366  }
367 
368  ptr += num*size;
369  }
370 
371  com += offset+masklen;
372  tot += row_tot;
373 
374  fout.write((char*)swap.data(), swap.size());
375 
376  memmove(buf.data(), buf.data()+offset, buf.size()-offset);
377  offset = buf.size()-offset;
378 
379  if (!fout)
380  throw runtime_error("Error writing to output file");
381 
382  const float proc = float(irow)/nrows;
383  if (proc>frac)
384  {
385  const double elep = Time().UnixTime()-start.UnixTime();
386  cout << "\r" << setprecision(0) << setw(3) << 100*proc << "% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s out:" << tot/1000000/elep << "MB/s" << flush;
387  frac += 0.01;
388  }
389  }
390 
391  const double elep = Time().UnixTime()-start.UnixTime();
392  cout << setprecision(0) << "\r100% [" << setprecision(1) << setw(5) << 100.*com/tot << "%] cpu:" << sec << "s out:" << tot/1000000/elep << "MB/s" << endl;
393 
394  return 0;
395 }
396 
397 int main(int argc, const char **argv)
398 {
399  Configuration conf(argv[0]);
401  SetupConfiguration(conf);
402 
403  if (!conf.DoParse(argc, argv))
404  return 127;
405 
406  const bool decomp = conf.Get<bool>("decompress");
407 
408  const string ifile = conf.Get<string>("in");
409  const string ofile = conf.Has("out") ? conf.Get<string>("out") : ReplaceExt(ifile, decomp);
410 
411  return decomp ? Decompress(ifile, ofile) : Compress(ifile, ofile);
412 
413  /*
414  // reading and writing files which just contain the binary data
415  // For simplicity I assume ROI=300
416 
417  ifstream finx( "20130117_082.fits.pu");
418  ofstream foutx("20130117_082.fits.puz");
419 
420  while (1)
421  {
422  string str;
423  str.resize(432000*2);
424 
425  finx.read((char*)str.c_str(), 432000*2);
426  if (!finx)
427  break;
428 
429  // Preprocess the data, e.g. subtract median pixelwise
430  for (int i=0; i<1440; i++)
431  {
432  int16_t *chunk = (int16_t*)str.c_str()+i*300;
433  sort(chunk, chunk+300);
434 
435  int16_t med = chunk[149];
436 
437  for (int j=0; j<300; j++)
438  chunk[j] -= med;
439  }
440 
441  // do huffman encoding on shorts
442  string buf;
443  int len = huffmans_encode(buf, (uint16_t*)str.c_str(), 432000);
444 
445  // if the result is smaller than the original data write
446  // the result, otherwise the original data
447  if (buf.size()<432000*2)
448  foutx.write(buf.c_str(), buf.size());
449  else
450  foutx.write(str.c_str(), 432000);
451  }
452 
453  return 0;
454  */
455 }
int start(int initState)
Definition: feeserver.c:1740
void SetupConfiguration(Configuration &conf)
Definition: zfits.cc:16
void * SetPtrAddress(const std::string &name)
Definition: fits.h:886
int Compress(const string &ifile, const string &ofile)
Definition: zfits.cc:66
std::map< std::string, Column > Columns
Definition: fits.h:113
uint64_t GetUInt(const std::string &key) const
Definition: fits.h:1009
void PrintUsage()
Definition: zfits.cc:34
Adds some functionality to boost::posix_time::ptime for our needs.
Definition: Time.h:30
char str[80]
Definition: test_client.c:7
void SetPrintUsage(const std::function< void(void)> &func)
T Get(const std::string &var)
po::typed_value< bool > * po_switch()
STL namespace.
int64_t Decode(const uint8_t *bufin, size_t bufinlen, std::vector< uint16_t > &pbufout)
Definition: huffman.h:401
string ReplaceEnd(const string &str, const string &expr, const string &repl)
Definition: zfits.cc:45
void SetArgumentPositions(const po::positional_options_description &desc)
virtual size_t GetNumRows() const
Definition: fits.h:1031
int main(int argc, const char **argv)
Definition: zfits.cc:397
void revcpy(char *dest, const char *src, int num)
Definition: zfits.cc:239
bool Encode(std::string &bufout, const uint16_t *bufin, size_t bufinlen)
Definition: huffman.h:385
bool GetNextRow(bool check=true)
Definition: fits.h:851
bool Has(const std::string &var)
Definition: fits.h:54
void AddOptions(const po::options_description &opt, bool visible=true)
Definition: Configuration.h:92
string ReplaceExt(const string &name, bool decomp)
Definition: zfits.cc:57
double UnixTime() const
Definition: Time.cc:195
Commandline parsing, resource file parsing and database access.
Definition: Configuration.h:9
int size
Definition: db_dim_server.c:17
if(extraDns) new Dns
const Table::Columns & GetColumns() const
Definition: fits.h:1004
int Decompress(const string &ifile, const string &ofile)
Definition: zfits.cc:246
bool DoParse(int argc, const char **argv, const std::function< void()> &func=std::function< void()>())
virtual bool GetRow(size_t row, bool check=true)
Definition: fits.h:827