42#include "EST_Ngrammar.h"
43#include "EST_Pathname.h"
154 EST_Ngrammar::representation_t representation =
164 EST_String(
"[input file0] [input file1] ... -o [output file]\n")+
165 "-w <ifile> filename containing word list (required)\n"+
166 "-p <ifile> filename containing predictee word list\n"+
167 " (default is to use wordlist given by -w)\n"+
168 "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
169 "-smooth <int> Good-Turing smooth the grammar up to the\n"+
170 " given frequency\n"+
171 "-o <ofile> Output file for constructed ngram\n"+
173 "-input_format <string>\n"+
174 " format of input data (default sentence_per_line)\n"+
175 " may be sentence_per_file, ngram_per_line.\n"+
176 "-otype <string> format of output file, one of cstr_ascii\n"+
177 " cstr_bin or htk_ascii\n"+
178 "-sparse build ngram in sparse representation\n"+
179 "-dense build ngram in dense representation (default)\n"+
181 " build backoff ngram (requires -smooth)\n"+
183 " frequency floor value used with some ngrams\n"+
184 "-freqsmooth <int>\n"+
185 " build frequency backed off smoothed ngram, this\n"+
186 " requires -smooth option\n"+
187 "-trace give verbose output about build process\n"+
188 "-save_compressed save ngram in gzipped format\n"+
189 "-oov_mode <string>\n"+
190 " what to do about out-of-vocabulary words,\n"+
191 " one of skip_ngram, skip_sentence (default),\n"+
192 " skip_file, or use_oov_marker\n"+
193 "-oov_marker <string>\n"+
194 " special word for oov words (default "+OOV_MARKER+
")\n"+
195 " (use in conjunction with '-oov_mode use_oov_marker'\n"+
198 "-prev_tag <string>\n"+
199 " tag before sentence start\n"+
200 "-prev_prev_tag <string>\n"+
201 " all words before 'prev_tag'\n"+
202 "-last_tag <string>\n"+
203 " after sentence end\n"+
204 "-default_tags use default tags of "+SENTENCE_START_MARKER+
205 ","+SENTENCE_END_MARKER+
" and "+SENTENCE_END_MARKER+
"\n"+
209 if (
al.present(
"-input_format"))
214 if (
al.present(
"-oov_mode"))
220 if(
al.present(
"-oov_marker"))
224 cerr <<
"Error : can only use -oov_marker with '-oov_mode use_oov_marker'" <<
endl;
243 if (
al.present(
"-w"))
246 cerr <<
"build_ngram: Must specify a wordlist with -w" <<
endl;
252 cerr <<
"build_ngram: Could not read wordlist from file "
258 if (
al.present(
"-p"))
// Report on stderr that separate predictor/predictee lists need ngram_per_line input.
263 cerr <<
"Can't have differing predictor/predictee lists unless data is in ngram_per_line format !" <<
endl;
270 cerr <<
"build_ngram: Could not read predictee list from file "
276 if (
al.present(
"-trace"))
279 if (
al.present(
"-o"))
280 out_file =
al.val(
"-o");
284 if (
al.present(
"-default_tags"))
290 wordlist.append(SENTENCE_START_MARKER);
291 wordlist.append(SENTENCE_END_MARKER);
293 if (
al.present(
"-p"))
300 if (
al.present(
"-prev_tag"))
302 if (
al.present(
"-default_tags"))
303 cerr <<
"build_ngram: WARNING : -prev_tag overrides -default_tags"
308 if (
al.present(
"-prev_prev_tag"))
310 if (
al.present(
"-default_tags"))
311 cerr <<
"build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
316 if (
al.present(
"-last_tag"))
318 if (
al.present(
"-default_tags"))
319 cerr <<
"build_ngram: WARNING : -last_tag overrides -default_tags"
327 cerr <<
"build_ngram: ERROR : if any tags are given, ALL must be given"
332 if (
al.present(
"-order"))
333 order =
al.ival(
"-order");
336 cerr <<
"build_ngram: WARNING : No order specified with -order : defaulting to bigram"
341 if (
al.present(
"-otype"))
346 if (
al.present(
"-floor"))
351 if (
al.present(
"-backoff"))
352 if (!
al.present(
"-smooth"))
354 cerr <<
"build_ngram: backoff requires smooth value" <<
endl;
357 if (
al.present(
"-freqsmooth"))
358 if (!
al.present(
"-smooth"))
360 cerr <<
"build_ngram: frequency smooth requires smooth value"
365 if (
al.present(
"-dense"))
366 representation = EST_Ngrammar::dense;
367 else if (
al.present(
"-sparse"))
369 cerr <<
"build_ngram: Sorry, sparse representation is not yet available " <<
endl;
371 representation = EST_Ngrammar::sparse;
373 else if (
al.present(
"-backoff"))
374 representation = EST_Ngrammar::backoff;
376 cerr <<
"build_ngram: Defaulting to dense representation" <<
endl;
378 if (
al.present(
"-p"))
382 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" <<
endl;
390 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" <<
endl;
396 if (
al.present(
"-backoff") )
400 al.ival(
"-backoff"),
al.ival(
"-smooth")))
402 cerr <<
"build_ngram: Failed to build backoff " << order
407 cerr <<
"build_ngram: Built backoff " << order <<
415 cerr <<
"build_ngram: Failed to build " << order <<
"-gram" <<
endl;
420 cerr <<
"build_ngram: Built " << order <<
"-gram" <<
endl;
425 if (
al.present(
"-freqsmooth"))
427 Ngram_freqsmooth(
ngrammar,
al.ival(
"-smooth"),
al.ival(
"-freqsmooth"));
429 else if (
al.present(
"-smooth") && !
al.present(
"-backoff"))
434 cerr <<
"build_ngram: Failed to smooth " << order <<
"-gram" <<
endl;
439 cerr <<
"build_ngram: Good Turing smoothed " << order <<
"-gram" <<
endl;
444 if (
al.present(
"-save_compressed"))
451 if (
tmp.extension() == GZIP_FILENAME_EXTENSION)
453 else if (
tmp.extension() == COMPRESS_FILENAME_EXTENSION)
459 out_file = out_file +
"." + GZIP_FILENAME_EXTENSION;
468 cerr <<
"build_ngram: Failed to compress to file "
477 cerr <<
"build_ngram: Saved in compressed " <<
format
478 <<
" format to " << out_file <<
endl;
482 cerr <<
"build_ngram: Failed to write temporary file "
495 <<
" format to " << out_file <<
endl;
499 cerr <<
"build_ngram: Failed to save " <<
format <<
" format data to "