diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py index d1d4c3bf160b..eedeb3d9e225 100644 --- a/paddlenlp/transformers/ernie/tokenizer.py +++ b/paddlenlp/transformers/ernie/tokenizer.py @@ -27,8 +27,46 @@ __all__ = ['ErnieTokenizer', 'ErnieTinyTokenizer'] PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "ernie-doc-base-en": 512, - "ernie-doc-base-zh": 512 + "ernie-1.0": 513, + "ernie-1.0-base-zh": 513, + "ernie-1.0-base-zh-cw": 512, + "ernie-1.0-large-zh-cw": 512, + "ernie-tiny": 600, + "ernie-2.0-base-zh": 513, + "ernie-2.0-large-zh": 512, + "ernie-2.0-base-en": 512, + "ernie-2.0-base-en-finetuned-squad": 512, + "ernie-2.0-large-en": 512, + "ernie-gen-base-en": 1024, + "ernie-gen-large-en": 1024, + "ernie-gen-large-en-430g": 1024, + "rocketqa-zh-dureader-query-encoder": 513, + "rocketqa-zh-dureader-para-encoder": 513, + "rocketqa-v1-marco-query-encoder": 512, + "rocketqa-v1-marco-para-encoder": 512, + "rocketqa-zh-dureader-cross-encoder": 513, + "rocketqa-v1-marco-cross-encoder": 512, + "ernie-3.0-base-zh": 2048, + "ernie-3.0-xbase-zh": 2048, + "ernie-3.0-medium-zh": 2048, + "ernie-3.0-mini-zh": 2048, + "ernie-3.0-micro-zh": 2048, + "ernie-3.0-nano-zh": 2048, + "rocketqa-zh-base-query-encoder": 2048, + "rocketqa-zh-base-para-encoder": 2048, + "rocketqa-zh-medium-query-encoder": 2048, + "rocketqa-zh-medium-para-encoder": 2048, + "rocketqa-zh-mini-query-encoder": 2048, + "rocketqa-zh-mini-para-encoder": 2048, + "rocketqa-zh-micro-query-encoder": 2048, + "rocketqa-zh-micro-para-encoder": 2048, + "rocketqa-zh-nano-query-encoder": 2048, + "rocketqa-zh-nano-para-encoder": 2048, + "rocketqa-base-cross-encoder": 2048, + "rocketqa-medium-cross-encoder": 2048, + "rocketqa-mini-cross-encoder": 2048, + "rocketqa-micro-cross-encoder": 2048, + "rocketqa-nano-cross-encoder": 2048 } @@ -288,48 +326,6 @@ class ErnieTokenizer(PretrainedTokenizer): "do_lower_case": True }, } - max_model_input_sizes = { - "ernie-1.0": 513, - "ernie-1.0-base-zh": 513, - "ernie-1.0-base-zh-cw": 512, - "ernie-1.0-large-zh-cw": 512, - "ernie-tiny": 600, - "ernie-2.0-base-zh": 513, - "ernie-2.0-large-zh": 512, - "ernie-2.0-base-en": 512, - "ernie-2.0-base-en-finetuned-squad": 512, - "ernie-2.0-large-en": 512, - "ernie-gen-base-en": 1024, - "ernie-gen-large-en": 1024, - "ernie-gen-large-en-430g": 1024, - "rocketqa-zh-dureader-query-encoder": 513, - "rocketqa-zh-dureader-para-encoder": 513, - "rocketqa-v1-marco-query-encoder": 512, - "rocketqa-v1-marco-para-encoder": 512, - "rocketqa-zh-dureader-cross-encoder": 513, - "rocketqa-v1-marco-cross-encoder": 512, - "ernie-3.0-base-zh": 2048, - "ernie-3.0-xbase-zh": 2048, - "ernie-3.0-medium-zh": 2048, - "ernie-3.0-mini-zh": 2048, - "ernie-3.0-micro-zh": 2048, - "ernie-3.0-nano-zh": 2048, - "rocketqa-zh-base-query-encoder": 2048, - "rocketqa-zh-base-para-encoder": 2048, - "rocketqa-zh-medium-query-encoder": 2048, - "rocketqa-zh-medium-para-encoder": 2048, - "rocketqa-zh-mini-query-encoder": 2048, - "rocketqa-zh-mini-para-encoder": 2048, - "rocketqa-zh-micro-query-encoder": 2048, - "rocketqa-zh-micro-para-encoder": 2048, - "rocketqa-zh-nano-query-encoder": 2048, - "rocketqa-zh-nano-para-encoder": 2048, - "rocketqa-base-cross-encoder": 2048, - "rocketqa-medium-cross-encoder": 2048, - "rocketqa-mini-cross-encoder": 2048, - "rocketqa-micro-cross-encoder": 2048, - "rocketqa-nano-cross-encoder": 2048, - } max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES