#####################################################################
# Sockeye-recipes Hyperparameter configuration file                 #
#                                                                   #
# Overview:                                                         #
# - "workdir" corresponds to a group of preprocessed bitext and     #
#    models for a given dataset. Each "workdir" can contain         #
#    multiple "datadir" and "modeldir" if desired                   #
# - "datadir" stores the BPE-preprocessed training and validation   #
#    bitext files                                                   #
# - "modeldir" is generated by Sockeye and stores all training info #
# - "rootdir" is path to your installation of sockeye-recipes,      #
#    e.g. ~/src/sockeye-recipes                                     #
#                                                                   #
# preprocess-bpe.sh:                                                #
# - input: Tokenized bitext for training ("train_tok")              #
#   and validation ("valid_tok")                                    #
# - output: BPE-preprocessed bitext ("train_bpe", "valid_bpe")      #
#   and vocabulary ("bpe_vocab_src", "bpe_vocab_trg")               #
# - main hyperparameters: number of BPE symbols for source & target #
#                                                                   #
# train.sh:                                                         #
# - input: BPE-preprocessed bitext ("train_bpe", "valid_bpe")       #
# - output: "modeldir", which contains all training info and can    #
#    be used to translate                                           #
# - main hyperparameters: many! see below                           #
#                                                                   #
# translate.sh:                                                     #
# - input: this hyperparam file, which specifies modeldir           #
# - output: resulting target translation of source file             #
#####################################################################


#####################################################################
# (0) General settings (to be modified for each project)            #
#####################################################################

### User-specified directories ###
# workdir: top-level project directory; groups the preprocessed bitext
# (datadir) and trained models (modeldir) for one dataset
workdir=__WORKDIR__
# modeldir: created by train.sh; stores all Sockeye training output
modeldir=$workdir/__MODELDIR__
# rootdir: path to your sockeye-recipes installation, e.g. ~/src/sockeye-recipes
rootdir=__ROOTDIR__
# DESCRIPTION: ts1: Transformer model, small

### Language pair (source and target) ###
# Note: We assume all bitext files carry these as filename suffixes,
# e.g. $train_tok.$src and $train_tok.$trg are the source and target sides
src=ru
trg=en


#####################################################################
# (1) preprocess-bpe.sh settings (modify if needed)                 #
#####################################################################

### Number of symbols to use for BPE ###
# Note: we perform source and target BPE separately
# This corresponds to the initial source (src) and target (trg) vocab size
bpe_symbols_src=30000
bpe_symbols_trg=30000

### Filename for BPE-processed bitext file ###
# Note: the following default names should be fine for most use cases
datadir=__PATH_TO_DATA__
train_bpe_src=$datadir/generaldomain.train.bpe.$src
valid_bpe_src=$datadir/generaldomain.dev.bpe.$src
train_bpe_trg=$datadir/generaldomain.train.bpe.$trg
valid_bpe_trg=$datadir/generaldomain.dev.bpe.$trg

### Filename for BPE vocabulary ###
# Note: the first two (commented-out) default names should be fine when
# training from scratch
# Note: the last two lines point at a pretrained model's vocabularies and
# should be used when just running translate.sh out-of-the-box
#bpe_vocab_src=${train_bpe_src}.bpe_vocab
#bpe_vocab_trg=${train_bpe_trg}.bpe_vocab
bpe_vocab_src=${modeldir}/src.bpe_vocab
# FIX: bpe_vocab_trg was missing; source and target vocabularies come in
# pairs (see the commented defaults above), and translation needs both
bpe_vocab_trg=${modeldir}/trg.bpe_vocab

#####################################################################
# (2) train.sh settings (modify if needed)                          #
#####################################################################

# Model architecture: small Transformer on both the encoder and decoder side.
# Paired values use the "source:target" convention throughout.
encoder="transformer"
decoder="transformer"
# encoder:decoder layer counts
num_layers="4:4"
# source:target embedding sizes
num_embed="512:512"
transformer_model_size="512"
transformer_attention_heads="8"
transformer_feed_forward_num_hidden="1024"

# Regularization
# source:target embedding dropout (disabled); label smoothing on the output
embed_dropout=".0:.0"
label_smoothing="0.1"

# Vocabulary (source:target limits; sizes come from the BPE settings above)
num_words="${bpe_symbols_src}:${bpe_symbols_trg}"
word_min_count="1:1"
max_seq_len="100:100"

# Optimization
batch_size="4096"
optimizer="adam"
initial_learning_rate="0.0002"
learning_rate_reduce_factor="0.9"
learning_rate_reduce_num_not_improved="8"
loss="cross-entropy"
seed="13"

# Checkpointing, evaluation, and stopping criteria
checkpoint_frequency="4000"
max_num_checkpoint_not_improved="32"
min_num_epochs="0"
max_num_epochs="100"
max_updates="500000"
keep_last_params="5"
decode_and_evaluate="50"

