t2t transformer model #446
Comments
You need to increase the file_byte_budget.
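(For context: when t2t builds the subword vocabulary it only samples a limited amount of text from each training file, capped by file_byte_budget, so a small budget can yield far fewer subwords than the targeted vocab size. Below is a minimal sketch of the idea only, not the actual tensor2tensor implementation; the function name is illustrative.)

```python
# Illustrative sketch only -- not the tensor2tensor implementation.
# Idea: read at most `file_byte_budget` bytes from each training file
# when collecting text for vocabulary construction.
def sample_corpus_for_vocab(filepaths, file_byte_budget=1000000):
  sampled_lines = []
  for path in filepaths:
    remaining = file_byte_budget
    with open(path, "r", encoding="utf-8") as f:
      for line in f:
        if remaining <= 0:
          break  # budget for this file exhausted
        sampled_lines.append(line.strip())
        remaining -= len(line.encode("utf-8"))
  return sampled_lines
```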
@martinpopel At first I did not change any parameters, I only added my dataset. However, after training the model, when I calculate the BLEU score the performance is very bad; the BLEU score is lower than the Groundhog translation model trained on the same dataset. I do not know what to do to improve the performance.
First, I see you use SpaceID.EN_CHR for both input and output, but actually you don't want character-based translation but rather subwords. I think for two-language translation (non-multitask) the SpaceID does not matter, but I am not sure. Finally, I am not sure this is the best place to discuss such general know-how. GitHub issues should be for reporting bugs, feature requests or very specific questions.
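(As an aside, here is a rough sketch of building a subword vocabulary of roughly a target size with t2t's SubwordTextEncoder; the training file path is hypothetical, and the exact build_to_target_size signature should be checked against your installed t2t version.)

```python
import collections

from tensor2tensor.data_generators import text_encoder

# Count whitespace-separated tokens in a (hypothetical) training file.
token_counts = collections.Counter()
with open("train.lang1", encoding="utf-8") as f:  # hypothetical path
  for line in f:
    token_counts.update(line.strip().split())

# Search for a minimum-count threshold that yields roughly 32k subwords.
encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
    32000,         # targeted vocab size
    token_counts,
    1,             # lower bound of the search over minimum token count
    1000)          # upper bound of the search
print(encoder.vocab_size)  # usually close to, but not exactly, 32000
```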
@martinpopel Thanks. If I want to increase the file_byte_budget, what should I do? Is that right? Do you have an account on https://gitter.im/tensor2tensor/Lobby?
No. You'll need to modify the source code.
Fix tensorflow#446: Added `file_size_budget` as argument to `get_or_generate_vocab`.
* Enhance WMT17 En-Zh task with full dataset. Fix #446. Added `file_size_budget` as argument to `get_or_generate_vocab`.
* Made requested fixes:
  - Added TranslateEnzhWmt8k problem.
  - Renamed to TranslateEnzhWmt32k, to reflect target vocab in problem name.
  - Added instructions for manually downloading the full dataset.
@schani Now I want to train my transformer model on a Chinese-Japanese corpus; I have about 10 million sentence pairs.
1. Generate the training and dev data. I added my data in word2def.py; the code is as follows:
""" Problem definition for word to dictionary definition.
"""
from future import absolute_import
from future import division
from future import print_function
import os
import tarfile
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators.translate import character_generator
from tensor2tensor.utils import registry
from tensor2tensor.models import transformer
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import translate
import tensorflow as tf
EOS = text_encoder.EOS_ID
English Word2def datasets
#LOCATION_OF_DATA='/dnn4/dnn4_added/zhangxiaolei/env63zxlpy36tf14/'
_WORD2DEF_TRAIN_DATASETS = [["/training-parallel-jp-ch.tgz",("training/translate_train_jpch.jp","training/translate_train_jpch.ch")]]
_WORD2DEF_TEST_DATASETS = [["/dev-parallel-jp-ch.tgz",("dev/translate_dev_jpch.jp","dev/translate_dev_jpch.ch")]]
@registry.register_problem()
class word2def(translate.TranslateProblem):
"""Problem spec for English word to dictionary definition."""
@Property
def targeted_vocab_size(self):
return 2**16
@Property
def vocab_name(self):
return "vocab.jpch"
def generator(self, data_dir, tmp_dir, train):
symbolizer_vocab = generator_utils.get_or_generate_vocab(data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,_WORD2DEF_TRAIN_DATASETS)
datasets = _WORD2DEF_TRAIN_DATASETS if train else WORD2DEF_TEST_DATASETS
tag = "train" if train else "dev"
data_path = translate.compile_data(tmp_dir, datasets,"wmt_jpch_tok%s" % tag)
return translate.token_generator(data_path + ".lang1", data_path + ".lang2",symbolizer_vocab, EOS)
@Property
def input_space_id(self):
return problem.SpaceID.EN_CHR
@Property
def target_space_id(self):
return problem.SpaceID.EN_CHR
@registry.register_hparams
def word2def_hparams(self):
hparams = transformer.transformer_base_single_gpu() # Or whatever you'd like to build off.
hparams.batch_size = 1024
return hparams
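(If the generated vocab ends up too small, the earlier advice about file_byte_budget applies to the generator above: once get_or_generate_vocab accepts the budget as an argument, it can be passed explicitly. This is a hedged sketch only; the commit referenced earlier names the argument file_size_budget, while later t2t releases expose it as file_byte_budget, so check the signature in your installed version.)

```python
# Sketch only: inside generator(), pass a larger sampling budget.
# The argument name depends on your tensor2tensor version
# (file_size_budget per the commit above, file_byte_budget in later releases).
symbolizer_vocab = generator_utils.get_or_generate_vocab(
    data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
    _WORD2DEF_TRAIN_DATASETS, file_byte_budget=int(1e8))
```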
__init__.py is as follows:

```python
# encoding: utf-8
from . import word2def
```
To generate the data:

```
PROBLEM=word2def
MODEL=transformer
HPARAMS=transformer_base_single_gpu

DATA_DIR=t2t_data
TMP_DIR=t2t_datagen
TRAIN_DIR=t2t_train/$PROBLEM/$MODEL-$HPARAMS

t2t-datagen --data_dir=$DATA_DIR --tmp_dir=$TMP_DIR --problem=$PROBLEM --t2t_usr_dir=/dnn4/dnn4_added/zhangxiaolei/env150zxlpy36-980
```
Now I have a problem: the targeted_vocab_size is 2**16 (65536) in the code, but in the end the vocab size in the generated file is only about 47000.
How can I expand the size of the vocab?
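(A quick way to confirm the actual size of the generated vocabulary is shown below; the file name follows vocab_name plus the targeted size, but verify the exact path that t2t-datagen wrote into your DATA_DIR.)

```python
from tensor2tensor.data_generators import text_encoder

# Load the vocab file that t2t-datagen wrote into DATA_DIR (path assumed).
enc = text_encoder.SubwordTextEncoder("t2t_data/vocab.jpch.65536")
print(enc.vocab_size)  # can come out smaller than targeted_vocab_size when
                       # the sampled training text (see file_byte_budget) is limited
```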