Remove dependency on nltk for paddle __init__. (#27388)

* Remove dependency on nltk for paddle __init__. test=develop * Remove the nltk movie_reviews sentiment dataset to remove the dependency on nltk. test=develop
5 years ago · 081fb2f963
parent df43905f12
commit 081fb2f963
7 changed files with 0 additions and 478 deletions
--- a/python/paddle/dataset/init.py
+++ b/python/paddle/dataset/init.py
@ -22,7 +22,6 @@ import paddle.dataset.cifar
 import paddle.dataset.movielens
 import paddle.dataset.conll05
 import paddle.dataset.uci_housing
 import paddle.dataset.sentiment
 import paddle.dataset.wmt14
 import paddle.dataset.wmt16
 import paddle.dataset.mq2007
@ -37,7 +36,6 @@ __all__ = [
    'cifar',
    'movielens',
    'conll05',
    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@ -1,150 +0,0 @@
 # /usr/bin/env python
 # -*- coding:utf-8 -*-
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script fetches and preprocesses the movie_reviews data set provided by NLTK.
 TODO(yuyang18): Complete dataset.
 """
 from __future__ import print_function
 import six
 import collections
 from itertools import chain
 import os
 import nltk
 from nltk.corpus import movie_reviews
 import zipfile
 from functools import cmp_to_key
 import paddle.dataset.common
 URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
 MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
 __all__ = ['train', 'test', 'get_word_dict']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
 def download_data_if_not_yet():
    """
    Make the movie_reviews corpus available to NLTK.

    The archive at ``URL`` is requested through
    ``paddle.dataset.common.download`` (with ``MD5`` passed as ``md5sum``)
    and extracted into ``DATA_HOME/corpora``. ``DATA_HOME`` is then appended
    to ``nltk.data.path`` when it is not already listed, and the corpus is
    probed with ``movie_reviews.categories()``.

    If any of these steps raises ``LookupError``, the corpus is fetched with
    ``nltk.download`` into ``DATA_HOME`` instead and its location is printed.
    """
    try:
        # download and extract movie_reviews.zip
        paddle.dataset.common.download(
            URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
        path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
        filename = os.path.join(path, 'movie_reviews.zip')
        # The context manager closes the archive even when extraction fails,
        # so the file handle is never leaked.
        with zipfile.ZipFile(filename) as zip_file:
            zip_file.extractall(path)
        # make sure that nltk can find the data
        if paddle.dataset.common.DATA_HOME not in nltk.data.path:
            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
        # Probing the corpus raises LookupError when NLTK cannot locate it.
        movie_reviews.categories()
    except LookupError:
        print("Downloading movie_reviews data set, please wait.....")
        nltk.download(
            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
        print("Download data set success.....")
        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
 def get_word_dict():
    """
    Rank every word of the corpus by how often it occurs.

    :return:
        words_freq_sorted: list of (word, rank) tuples, where rank 0 belongs
        to the most frequent word
    """
    download_data_if_not_yet()
    counts = collections.defaultdict(int)
    for category in movie_reviews.categories():
        for file_id in movie_reviews.fileids(category):
            for token in movie_reviews.words(file_id):
                counts[token] += 1
    # Stable sort on the negated count: most frequent first, and words with
    # equal counts keep the order in which the dictionary yields them.
    ranked = sorted(six.iteritems(counts), key=lambda item: -item[1])
    return [(entry[0], rank) for rank, entry in enumerate(ranked)]
 def sort_files():
    """
    Interleave the negative and positive review files.

    :return:
        files_list: file ids ordered neg, pos, neg, pos, ...; pairing stops
        at the end of the shorter category
    """
    negatives = movie_reviews.fileids('neg')
    positives = movie_reviews.fileids('pos')
    return [
        file_id for pair in zip(negatives, positives) for file_id in pair
    ]
 def load_sentiment_data():
    """
    Read every review and encode it as word ids plus a label.

    :return:
        data_set: list of (word_id_list, category) tuples in the order given
        by sort_files(); category is 0 when the file id contains 'neg',
        otherwise 1
    """
    download_data_if_not_yet()
    word_to_id = dict(get_word_dict())
    samples = []
    for file_id in sort_files():
        label = 1
        if 'neg' in file_id:
            label = 0
        # Words are lower-cased before the id lookup.
        encoded = [
            word_to_id[token.lower()]
            for token in movie_reviews.words(file_id)
        ]
        samples.append((encoded, label))
    return samples
 def reader_creator(data):
    """
    Generator that walks a data set sample by sample.

    :param data:
        train data set or test data set; every element must support
        indexing with [0] and [1]
    """
    for sample in data:
        words, label = sample[0], sample[1]
        yield words, label
 def train():
    """
    Default training set reader creator.

    Loads the full data set and iterates over its first
    NUM_TRAINING_INSTANCES samples.
    """
    training_samples = load_sentiment_data()[:NUM_TRAINING_INSTANCES]
    return reader_creator(training_samples)
 def test():
    """
    Default test set reader creator.

    Loads the full data set and iterates over every sample after the first
    NUM_TRAINING_INSTANCES.
    """
    held_out_samples = load_sentiment_data()[NUM_TRAINING_INSTANCES:]
    return reader_creator(held_out_samples)
 def fetch():
    """
    Download the movie_reviews corpus into DATA_HOME through nltk.
    """
    target_dir = paddle.dataset.common.DATA_HOME
    nltk.download('movie_reviews', download_dir=target_dir)
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@ -1,58 +0,0 @@
 # /usr/bin/env python
 # -*- coding:utf-8 -*-
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
 import unittest
 import nltk
 import paddle.dataset.sentiment as st
 from nltk.corpus import movie_reviews
 class TestSentimentMethods(unittest.TestCase):
    """Checks for the NLTK-backed readers in paddle.dataset.sentiment."""

    def test_get_word_dict(self):
        """The ten most frequent words receive ranks 0..9 in a fixed order."""
        # Imported locally so the path check follows the configured data
        # directory instead of assuming the tests run as root.
        from paddle.dataset.common import DATA_HOME
        word_dict = st.get_word_dict()[0:10]
        test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
                          ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
        # Comparing the whole list also fails when fewer than ten entries
        # come back, which an element-wise loop would silently accept.
        self.assertEqual(word_dict, test_word_list)
        self.assertIn(DATA_HOME, nltk.data.path)

    def test_sort_files(self):
        """Consecutive files never come from the same category."""
        last_label = ''
        for sample_file in st.sort_files():
            # File ids are split on "/" and the first part is used as the
            # category label.
            current_label = sample_file.split("/")[0]
            self.assertNotEqual(current_label, last_label)
            last_label = current_label

    def test_data_set(self):
        """Labels alternate and the split sizes match the constants."""
        data_set = st.load_sentiment_data()
        last_label = -1
        for each in st.test():
            self.assertNotEqual(each[1], last_label)
            last_label = each[1]
        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
        self.assertEqual(
            len(list(st.test())),
            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
@ -1,42 +0,0 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 TestCases for Dataset,
 including create, config, run, etc.
 """
 from __future__ import print_function
 import numpy as np
 import unittest
 import os
 import paddle
 import zipfile
 import paddle.dataset.common
 URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
 MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
 class TestDatasetSentiment(unittest.TestCase):
    """  TestCases for Sentiment. """

    def test_get_word_dict(self):
        """ Testcase for get_word_dict: checks the vocabulary size. """
        words_freq_sorted = paddle.dataset.sentiment.get_word_dict()
        # assertEqual reports the actual length on failure; the full
        # vocabulary is no longer dumped to stdout.
        self.assertEqual(len(words_freq_sorted), 39768)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/tests/test_dataset_movie_reviews.py
+++ b/python/paddle/tests/test_dataset_movie_reviews.py
@ -1,50 +0,0 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
 import numpy as np
 from paddle.text.datasets import *
 class TestMovieReviewsTrain(unittest.TestCase):
    """Sanity checks for the 'train' split of MovieReviews."""

    def test_main(self):
        """The split has 1600 samples shaped as (1-D array, 0/1 label)."""
        movie_reviews = MovieReviews(mode='train')
        self.assertEqual(len(movie_reviews), 1600)
        # Traversing the whole dataset may take a long time,
        # so only one randomly chosen sample is checked.
        idx = np.random.randint(0, 1600)
        data = movie_reviews[idx]
        self.assertEqual(len(data), 2)
        self.assertEqual(len(data[0].shape), 1)
        self.assertIn(int(data[1]), [0, 1])
 class TestMovieReviewsTest(unittest.TestCase):
    """Sanity checks for the 'test' split of MovieReviews."""

    def test_main(self):
        """The split has 400 samples shaped as (1-D array, 0/1 label)."""
        movie_reviews = MovieReviews(mode='test')
        self.assertEqual(len(movie_reviews), 400)
        # Traversing the whole dataset may take a long time,
        # so only one randomly chosen sample is checked.
        idx = np.random.randint(0, 400)
        data = movie_reviews[idx]
        self.assertEqual(len(data), 2)
        self.assertEqual(len(data[0].shape), 1)
        self.assertIn(int(data[1]), [0, 1])
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/text/datasets/init.py
+++ b/python/paddle/text/datasets/init.py
@ -16,7 +16,6 @@ from . import conll05
 from . import imdb
 from . import imikolov
 from . import movielens
 from . import movie_reviews
 from . import uci_housing
 from . import wmt14
 from . import wmt16
@ -25,7 +24,6 @@ from .conll05 import *
 from .imdb import *
 from .imikolov import *
 from .movielens import *
 from .movie_reviews import *
 from .uci_housing import *
 from .wmt14 import *
 from .wmt16 import *
@ -34,7 +32,6 @@ __all__ = conll05.__all__ \
          + imdb.__all__ \
          + imikolov.__all__ \
          + movielens.__all__ \
          + movie_reviews.__all__ \
          + uci_housing.__all__ \
          + wmt14.__all__ \
          + wmt16.__all__
--- a/python/paddle/text/datasets/movie_reviews.py
+++ b/python/paddle/text/datasets/movie_reviews.py
@ -1,173 +0,0 @@
 #   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
 import os
 import six
 import numpy as np
 import collections
 import nltk
 from nltk.corpus import movie_reviews
 import zipfile
 from functools import cmp_to_key
 from itertools import chain
 import paddle
 from paddle.io import Dataset
 __all__ = ['MovieReviews']
 URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
 MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
 class MovieReviews(Dataset):
    """
    Implementation of `NLTK movie reviews <http://www.nltk.org/nltk_data/>`_ dataset.

    The corpus is downloaded (if needed) and read into memory when the
    dataset is constructed. The first ``NUM_TRAINING_INSTANCES`` samples form
    the 'train' split and the remaining ones the 'test' split.

    Args:
        mode(str): 'train' or 'test' mode (case-insensitive). Default 'train'.

    Returns:
        Dataset: instance of movie reviews dataset

    Examples:
        .. code-block:: python
            import paddle
            from paddle.text.datasets import MovieReviews
            class SimpleNet(paddle.nn.Layer):
                def __init__(self):
                    super(SimpleNet, self).__init__()
                def forward(self, word, category):
                    return paddle.sum(word), category
            paddle.disable_static()
            movie_reviews = MovieReviews(mode='train')
            for i in range(10):
                word_list, category = movie_reviews[i]
                word_list = paddle.to_tensor(word_list)
                category = paddle.to_tensor(category)
                model = SimpleNet()
                word_list, category = model(word_list, category)
                print(word_list.numpy().shape, category.numpy())
    """

    def __init__(self, mode='train'):
        assert mode.lower() in ['train', 'test'], \
            "mode should be 'train', 'test', but got {}".format(mode)
        self.mode = mode.lower()
        self._download_data_if_not_yet()
        # read dataset into memory
        self._load_sentiment_data()

    def _get_word_dict(self):
        """
        Rank the words by how often they occur in the corpus.

        :return:
            words_freq_sorted: list of (word, rank) tuples, rank 0 being the
            most frequent word
        """
        words_freq_sorted = list()
        word_freq_dict = collections.defaultdict(int)
        for category in movie_reviews.categories():
            for field in movie_reviews.fileids(category):
                for words in movie_reviews.words(field):
                    word_freq_dict[words] += 1
        words_sort_list = list(six.iteritems(word_freq_dict))
        # Descending by count; the sort is stable, so ties keep dict order.
        words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1]))
        for index, word in enumerate(words_sort_list):
            words_freq_sorted.append((word[0], index))
        return words_freq_sorted

    def _sort_files(self):
        """
        Interleave negative and positive files (neg, pos, neg, pos, ...).

        :return:
            files_list
        """
        neg_file_list = movie_reviews.fileids('neg')
        pos_file_list = movie_reviews.fileids('pos')
        files_list = list(
            chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
        return files_list

    def _load_sentiment_data(self):
        """
        Load the data set into ``self.data``.

        Each entry is a (word_id_list, category) tuple; category is 0 when
        the file id contains 'neg' and 1 otherwise.
        """
        self.data = []
        words_ids = dict(self._get_word_dict())
        for sample_file in self._sort_files():
            words_list = list()
            category = 0 if 'neg' in sample_file else 1
            # NOTE(review): the dictionary is keyed by raw tokens while the
            # lookup lower-cases them; this presumably relies on the corpus
            # being lower-case already -- confirm.
            for word in movie_reviews.words(sample_file):
                words_list.append(words_ids[word.lower()])
            self.data.append((words_list, category))

    def _download_data_if_not_yet(self):
        """
        Download the data set if it has not been downloaded yet.

        Falls back to ``nltk.download`` when a ``LookupError`` is raised
        while preparing or probing the corpus.
        """
        try:
            # download and extract movie_reviews.zip
            paddle.dataset.common.download(
                URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
            path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
            filename = os.path.join(path, 'movie_reviews.zip')
            # The context manager closes the archive even when extraction
            # fails, so the file handle is never leaked.
            with zipfile.ZipFile(filename) as zip_file:
                zip_file.extractall(path)
            # make sure that nltk can find the data
            if paddle.dataset.common.DATA_HOME not in nltk.data.path:
                nltk.data.path.append(paddle.dataset.common.DATA_HOME)
            movie_reviews.categories()
        except LookupError:
            print("Downloading movie_reviews data set, please wait.....")
            nltk.download(
                'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
            print("Download data set success.....")
            print("Path is " + nltk.data.find('corpora/movie_reviews').path)

    def __getitem__(self, idx):
        """
        Return sample ``idx`` of the active split as two numpy arrays.

        Negative indices count from the end of the active split.

        Raises:
            IndexError: if ``idx`` lies outside the active split.
        """
        size = len(self)
        # Without this check, 'train' could read test samples and negative
        # indices would resolve into the wrong split.
        if not -size <= idx < size:
            raise IndexError(
                "index {} is out of range for the '{}' split of size {}".
                format(idx, self.mode, size))
        if idx < 0:
            idx += size
        if self.mode == 'test':
            # Test samples are stored after the training samples.
            idx += NUM_TRAINING_INSTANCES
        data = self.data[idx]
        return np.array(data[0]), np.array(data[1])

    def __len__(self):
        """Return the number of samples in the active split."""
        if self.mode == 'train':
            return NUM_TRAINING_INSTANCES
        else:
            return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES