You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/text/datasets/uci_housing.py

114 lines
3.8 KiB

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import six
import numpy as np
import paddle
from paddle.io import Dataset
from paddle.dataset.common import _check_exists_and_download
__all__ = ["UCIHousing"]
URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'
feature_names = [
'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT'
]
class UCIHousing(Dataset):
"""
Implementation of `UCI housing <https://archive.ics.uci.edu/ml/datasets/Housing>`_
dataset
Args:
data_file(str): path to data file, can be set None if
:attr:`download` is True. Default None
mode(str): 'train' or 'test' mode. Default 'train'.
download(bool): whether to download dataset automatically if
:attr:`data_file` is not set. Default True
Returns:
Dataset: instance of UCI housing dataset.
Examples:
.. code-block:: python
import paddle
from paddle.text.datasets import UCIHousing
class SimpleNet(paddle.nn.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
def forward(self, feature, target):
return paddle.sum(feature), target
paddle.disable_static()
uci_housing = UCIHousing(mode='train')
for i in range(10):
feature, target = uci_housing[i]
feature = paddle.to_tensor(feature)
target = paddle.to_tensor(target)
model = SimpleNet()
feature, target = model(feature, target)
print(feature.numpy().shape, target.numpy())
"""
def __init__(self, data_file=None, mode='train', download=True):
assert mode.lower() in ['train', 'test'], \
"mode should be 'train' or 'test', but got {}".format(mode)
self.mode = mode.lower()
self.data_file = data_file
if self.data_file is None:
assert download, "data_file is not set and downloading automatically is disabled"
self.data_file = _check_exists_and_download(data_file, URL, MD5,
'uci_housing', download)
# read dataset into memory
self._load_data()
self.dtype = paddle.get_default_dtype()
def _load_data(self, feature_num=14, ratio=0.8):
data = np.fromfile(self.data_file, sep=' ')
data = data.reshape(data.shape[0] // feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
for i in six.moves.range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
if self.mode == 'train':
self.data = data[:offset]
elif self.mode == 'test':
self.data = data[offset:]
def __getitem__(self, idx):
data = self.data[idx]
return np.array(data[:-1]).astype(self.dtype), \
np.array(data[-1:]).astype(self.dtype)
def __len__(self):
return len(self.data)