################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### locations of the experiment directory, tools and data
# these are user-specific absolute paths (/home/pkoehn/...);
# adapt them to the local installation
# values may refer to other settings as $name, or as ${name} when the
# reference is embedded in a longer word (see working-dir)
#
working-dir = /home/pkoehn/experiment/wmt10-${input-extension}-${output-extension}
moses-script-dir = /home/pkoehn/syntax/hieu_chart/scripts
moses-src-dir = /home/pkoehn/moses
edinburgh-script-dir = /home/pkoehn/edinburgh-scripts/scripts
wmt10-data = /home/pkoehn/statmt/data/europarl-v5

### binaries located under the moses source directory
#
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
decoder = $moses-src-dir/moses-cmd/src/mosesmt.2579

### commands for tokenization and truecasing of the input and output side
# the tokenizers receive the language through -l
#
input-tokenizer = "$edinburgh-script-dir/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$edinburgh-script-dir/tokenizer.perl -a -l $output-extension"
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

### file name extensions for the input language, the output language
# and the language pair, as used in the data file names below
#
input-extension = es
output-extension = en
pair-extension = es-en

### NOTE(review): presumably the number of parallel jobs, as documented
# for the key of the same name in [TESTING] -- confirm
#
jobs = 10

#################################################################
# PARALLEL CORPUS PREPARATION: 
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### tools to use to prepare the data
# (left unset here)
#
#tokenizer = 
#lowercaser = 

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data.
# set the maximum sentence length here
# (the same key appears, commented out, in [CORPUS:europarl])
#
max-sentence-length = 80

[CORPUS:europarl]

### command to run to get raw corpus files
# NOTE: $europarl-v3 is not defined in the visible part of this file;
# define it (or adjust the path) before enabling this setting
#
# get-corpus-script = $europarl-v3/get-parallel-corpus.perl

### raw corpus files (untokenized, but sentence-aligned)
# the value is a file name stem that ends in the language pair extension
# 
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data.
# set the maximum sentence length here
# (commented out: [CORPUS] above already sets max-sentence-length = 80)
#
#max-sentence-length = 80

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem = 

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem = 

[CORPUS:nc]
### raw corpus files with the stem news-commentary10.<pair-extension>
# only raw-stem is set; see [CORPUS:europarl] for the other possible keys
#
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension

[CORPUS:un]
### raw corpus files with the stem undoc.2000.<pair-extension>
#
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
# 
lm-training = $moses-src-dir/srilm/bin/i686-m64/ngram-count
#lm-training = /home/miles/projects/mt/diskbased-lm-training/lm-estimate-new

### NOTE(review): presumably the command-line options handed to the
# lm-training tool and the n-gram order of the models -- confirm
#
settings = "-interpolate -kndiscount -unk"
order = 5

### script to use for binary table format
# (default: no binarization)
#
lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm

### script to create quantized language model format
# (default: no quantization)
# 
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm


### tools to use to prepare the data
# (left unset here)
#
#tokenizer = 
#lowercaser = 

### each language model to be used has its own section here

[LM:europarl]

### command to run to get raw corpus files
# NOTE: $europarl-v3 is not defined in the visible part of this file;
# define it (or adjust the path) before enabling this setting
#
#get-corpus-script = "$europarl-v3/get-lm-corpus.perl $output-extension"

### raw corpus (untokenized)
# a single file in the output language
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension

### tokenized corpus file (may contain long sentences)
#
#tokenized-corpus = 

### if corpus preparation should be skipped, 
# point to the prepared language model
#
#lm = 

[LM:nc]
### output-language file of the stem configured as raw-stem in [CORPUS:nc]
#
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension

[LM:un]
### output-language file of the stem configured as raw-stem in [CORPUS:un]
#
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news]
### output-language file that has no counterpart in the [CORPUS:*] sections
#
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled


#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM]

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance, [Koehn and Schwenk, IJCNLP 2008]

### directory that includes the srilm binaries
# (the same directory that holds lm-training in [LM])
#
srilm-dir = $moses-src-dir/srilm/bin/i686-m64


### script to interpolate language models
# if commented out, no interpolation is performed
#
script = $edinburgh-script-dir/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
# here: the same file as raw-reference in [TUNING]
#
raw-tuning = $wmt10-data/dev/news-test2008.$output-extension


#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or 
# current moses training script (default) 
# 
script = $moses-script-dir/training/train-model.perl

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### NOTE(review): no description was given for this setting; the name
# suggests that the GIZA++ run is split into this many parts -- confirm
#
run-giza-in-parts = 5

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### factored training: specify here which factors are used
# if none are specified, single factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### if word alignment (giza symmetrization) should be skipped,
# point to word alignment files
#
# word-alignment = 

### settings for the Berkeley aligner (all disabled)
# NOTE: alignment-symmetrization-method is already set above; when
# enabling this group, comment out that line to avoid a duplicate key
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $edinburgh-script-dir/berkeley-train.sh
#berkeley-process =  $edinburgh-script-dir/berkeley-process.sh
#berkeley-jar = /home/pkoehn/statmt/project/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx15000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5

### hierarchical rule set
#
#hierarchical-rule-set = true

### NOTE(review): separate from the hierarchical rule set switch above;
# presumably options for the scoring step of training -- confirm
#
score-settings = "--GoodTuring"

### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases = 

### if phrase table training should be skipped,
# point to phrase translation table
#
# phrase-translation-table = 

### if reordering table training should be skipped,
# point to reordering table
#
# reordering-table = 

### if training should be skipped, 
# point to a configuration file that contains
# pointers to all relevant model files
#
#config = 

#####################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled
# specify here an old configuration file with matching weights
#
#weight-config = $working-dir/tuning/moses.weight-reused.ini.1

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl

### specify the corpus used for tuning 
# it should contain hundreds, if not thousands, of sentences
# input and reference below are the two language sides of news-test2008
#
raw-input = $wmt10-data/dev/news-test2008.$input-extension
# tokenized-input = 
# factorized-input = 
# input =
# 
raw-reference = $wmt10-data/dev/news-test2008.$output-extension
# tokenized-reference = 
# factorized-reference = 
# reference = 

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear whether this matters
#
# lambda = 

### additional flags for the decoder
# (empty: no additional flags)
#
decoder-settings = ""

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config = 

#########################################################
## TESTING: translating a test set using the tuned system

[TESTING]

### number of jobs (if parallel execution of testing)
# same value as jobs in [GENERAL]
#
jobs = 10

### additional flags for the decoder
# (compare decoder-settings in [TUNING], which is left empty)
#
decoder-settings = "-mbr -mp -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

[TESTING:newstest2009]

### input data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
raw-input = $wmt10-data/dev/newstest2009.$input-extension
# tokenized-input = 
# factorized-input =
# input = 

#########################################################
## RECASER: restore case, this part only trains the model

[RECASING]

# all settings in this section are commented out

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]

# recase-config = 

# NOTE: this path uses srilm/bin/i686, while lm-training in [LM] uses
# srilm/bin/i686-m64; check the path before enabling
#lm-training = $moses-src-dir/srilm/bin/i686/ngram-count

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
# tokenized-stem = $working-dir/data/ep+nc

### trained model
#
# truecase-model = 

##################################
## EVALUATION: score system output

[EVALUATION]

### prepare system output for scoring 
# this may include detokenization and wrapping output in sgm 
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$edinburgh-script-dir/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$edinburgh-script-dir/wrap-xml.perl $output-extension"
# output-sgm = 

### should output be scored case-sensitive (default: no)?
#
# case-sensitive = yes

### BLEU
# nist-bleu-c calls the same script as nist-bleu, with the additional
# flag -c (NOTE(review): presumably case-sensitive scoring -- confirm)
#
nist-bleu = $edinburgh-script-dir/mteval-v11b.pl
nist-bleu-c = "$edinburgh-script-dir/mteval-v11b.pl -c"
# multi-bleu = $edinburgh-script-dir/multi-bleu.perl
# ibm-bleu =

### TER: translation error rate (BBN metric) based on edit distance
#
# ter = $edinburgh-script-dir/tercom_v6a.pl

### METEOR: gives credit to stem / WordNet synonym matches
#
# meteor = 

[EVALUATION:newstest2009]

### input and reference data
# raw-input reuses the value of raw-input in [TESTING:newstest2009]
#
raw-input = $TESTING:newstest2009:raw-input
# tokenized-input = 
# factorized-input =
# input = 
# wrapped-reference = 
# raw-reference =
# tokenized-reference = 
# reference = 

### sgm files of the test set
# wrapping-frame refers to input-sgm, defined on the line above it
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
wrapping-frame = $input-sgm
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm

[REPORTING]

### what to do with the result (default: store in file evaluation/report)
# no key is set in this section, so the default applies
# 
# email = pkoehn@inf.ed.ac.uk

