It seems subword was deprecated in TensorFlow 2.3
Source: 10-15 Data preprocessing and dataset generation
qq_慕前端4252840
2021-08-05
I rewrote the dataset-generation part using the Tokenizer from keras; feel free to use it as a reference:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# Load the Portuguese-English translation dataset together with its metadata.
examples, info = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                           as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

# Word-level tokenizers built with keras instead of the deprecated
# tfds SubwordTextEncoder; repr() turns each byte string into text
# before it is split on spaces.
en_tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, filters='',
                                                  split=' ')
en_tokenizer.fit_on_texts(repr(en.numpy()).split(' ')
                          for pt, en in train_examples)
pt_tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, filters='',
                                                  split=' ')
pt_tokenizer.fit_on_texts(repr(pt.numpy()).split(' ')
                          for pt, en in train_examples)

buffer_size = 20000
batch_size = 64
max_length = 40

def encode_to_id(pt_sentence, en_sentence):
    # Map each word to its id and flatten to a 1-D int64 tensor.
    pt_sentence = pt_tokenizer.texts_to_sequences(repr(pt_sentence.numpy()).split(' '))
    pt_sentence = tf.convert_to_tensor(pt_sentence, dtype=tf.int64)
    pt_sentence = tf.reshape(pt_sentence, shape=(len(pt_sentence),))
    en_sentence = en_tokenizer.texts_to_sequences(repr(en_sentence.numpy()).split(' '))
    en_sentence = tf.convert_to_tensor(en_sentence, dtype=tf.int64)
    en_sentence = tf.reshape(en_sentence, shape=(len(en_sentence),))
    return pt_sentence, en_sentence

def filter_by_maxlen(pt, en):
    # Keep only sentence pairs no longer than max_length tokens.
    return tf.logical_and(tf.size(pt) <= max_length, tf.size(en) <= max_length)

def tf_encode_to_id(pt_sentence, en_sentence):
    # Wrap the eager encoding function so it can run inside dataset.map.
    return tf.py_function(encode_to_id, [pt_sentence, en_sentence],
                          [tf.int64, tf.int64])

train_dataset = train_examples.map(tf_encode_to_id)
train_dataset = train_dataset.filter(filter_by_maxlen)
train_dataset = train_dataset.shuffle(buffer_size).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))
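
A quick way to check the rebuilt pipeline is to pull one padded batch and look at its shapes; a minimal sketch, assuming the code above has already run:

for pt_batch, en_batch in train_dataset.take(1):
    # Both tensors are padded to the longest sentence in the batch,
    # so the second dimension should be at most max_length (40).
    print(pt_batch.shape, en_batch.shape)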
1 Answer
Yes, this API has been deprecated in tensorflow_datasets: https://www.tensorflow.org/datasets/api_docs/python/tfds/deprecated/text/SubwordTextEncoder
See the discussion here: https://github.com/tensorflow/datasets/issues/2879
The functionality was moved into tensorflow_text: https://www.tensorflow.org/text
tensorflow_text has a subword example:
import tensorflow as tf
import tensorflow_text as tf_text

def preprocess(vocab_table, example_text):
    # Normalize text
    example_text = tf_text.normalize_utf8(example_text)
    # Tokenize into words
    word_tokenizer = tf_text.WhitespaceTokenizer()
    tokens = word_tokenizer.tokenize(example_text)
    # Tokenize into subwords
    subword_tokenizer = tf_text.WordpieceTokenizer(
        vocab_table, token_out_type=tf.int64)
    subtokens = subword_tokenizer.tokenize(tokens).merge_dims(1, -1)
    # Apply padding
    padded_inputs = tf_text.pad_model_inputs(subtokens, max_seq_length=16)
    return padded_inputs
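
The example leaves open where vocab_table comes from. The snippet below is only my rough sketch of one way to supply it: the tiny hand-written WordPiece vocabulary and the sample sentence are made up for illustration and are not part of the course code.

# Hypothetical sketch: a tiny hand-made WordPiece vocabulary, only to show
# how a vocab_table for WordpieceTokenizer could be constructed.
vocab = ['[UNK]', 'trans', '##former', 'is', 'great']
ids = tf.range(len(vocab), dtype=tf.int64)
vocab_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(vocab, ids),
    num_oov_buckets=1)

# pad_model_inputs returns the padded id tensor plus a mask.
padded_ids, mask = preprocess(vocab_table, tf.constant(['transformer is great']))
print(padded_ids)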