deepfmnet test case

changed dataset (dataset in H5 format)
add tables library
pull/6980/head
wsq3 5 years ago
parent d113e7a694
commit 9ebf8e2362

@@ -17,3 +17,4 @@ bs4
astunparse
packaging >= 20.0
pycocotools >= 2.0.0 # for st test
tables >= 3.6.1 # for st test
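
The tables (PyTables) package is the HDF5 backend that pandas relies on, so the fixed-format .h5 parts written by the preprocessing script below and read back by the st test only work when it is installed. A minimal sketch of that round trip, assuming nothing beyond pandas defaults (file name and shape are illustrative):

import numpy as np
import pandas as pd  # pandas delegates HDF5 I/O to PyTables ("tables")

# write a small fixed-format HDF5 file with the same call the preprocessing script uses
pd.DataFrame(np.zeros((4, 78))).to_hdf("example_part.h5", key="fixed")
# read it back; this raises ImportError if "tables" is not installed
print(pd.read_hdf("example_part.h5", key="fixed").shape)  # (4, 78)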

@@ -0,0 +1,233 @@
# coding:utf-8
import os
import pickle
import collections
import argparse
import numpy as np
import pandas as pd
TRAIN_LINE_COUNT = 45840617
TEST_LINE_COUNT = 6042135
class DataStatsDict():
def __init__(self):
self.field_size = 39 # value_1-13; cat_1-26;
self.val_cols = ["val_{}".format(i + 1) for i in range(13)]
self.cat_cols = ["cat_{}".format(i + 1) for i in range(26)]
#
self.val_min_dict = {col: 0 for col in self.val_cols}
self.val_max_dict = {col: 0 for col in self.val_cols}
self.cat_count_dict = {col: collections.defaultdict(int) for col in self.cat_cols}
#
self.oov_prefix = "OOV_"
self.cat2id_dict = {}
self.cat2id_dict.update({col: i for i, col in enumerate(self.val_cols)})
self.cat2id_dict.update({self.oov_prefix + col: i + len(self.val_cols) for i, col in enumerate(self.cat_cols)})
# { "val_1": , ..., "val_13": , "OOV_cat_1": , ..., "OOV_cat_26": }
def stats_vals(self, val_list):
assert len(val_list) == len(self.val_cols)
def map_max_min(i, val):
key = self.val_cols[i]
if val != "":
if float(val) > self.val_max_dict[key]:
self.val_max_dict[key] = float(val)
if float(val) < self.val_min_dict[key]:
self.val_min_dict[key] = float(val)
for i, val in enumerate(val_list):
map_max_min(i, val)
def stats_cats(self, cat_list):
assert len(cat_list) == len(self.cat_cols)
def map_cat_count(i, cat):
key = self.cat_cols[i]
self.cat_count_dict[key][cat] += 1
for i, cat in enumerate(cat_list):
map_cat_count(i, cat)
#
def save_dict(self, output_path, prefix=""):
with open(os.path.join(output_path, "{}val_max_dict.pkl".format(prefix)), "wb") as file_wrt:
pickle.dump(self.val_max_dict, file_wrt)
with open(os.path.join(output_path, "{}val_min_dict.pkl".format(prefix)), "wb") as file_wrt:
pickle.dump(self.val_min_dict, file_wrt)
with open(os.path.join(output_path, "{}cat_count_dict.pkl".format(prefix)), "wb") as file_wrt:
pickle.dump(self.cat_count_dict, file_wrt)
def load_dict(self, dict_path, prefix=""):
with open(os.path.join(dict_path, "{}val_max_dict.pkl".format(prefix)), "rb") as file_wrt:
self.val_max_dict = pickle.load(file_wrt)
with open(os.path.join(dict_path, "{}val_min_dict.pkl".format(prefix)), "rb") as file_wrt:
self.val_min_dict = pickle.load(file_wrt)
with open(os.path.join(dict_path, "{}cat_count_dict.pkl".format(prefix)), "rb") as file_wrt:
self.cat_count_dict = pickle.load(file_wrt)
print("val_max_dict.items()[:50]: {}".format(list(self.val_max_dict.items())))
print("val_min_dict.items()[:50]: {}".format(list(self.val_min_dict.items())))
def get_cat2id(self, threshold=100):
for key, cat_count_d in self.cat_count_dict.items():
new_cat_count_d = dict(filter(lambda x: x[1] > threshold, cat_count_d.items()))
for cat_str, _ in new_cat_count_d.items():
self.cat2id_dict[key + "_" + cat_str] = len(self.cat2id_dict)
# print("before_all_count: {}".format( before_all_count )) # before_all_count: 33762577
# print("after_all_count: {}".format( after_all_count )) # after_all_count: 184926
print("cat2id_dict.size: {}".format(len(self.cat2id_dict)))
print("cat2id_dict.items()[:50]: {}".format(list(self.cat2id_dict.items())[:50]))
def map_cat2id(self, values, cats):
def minmax_scale_value(i, val):
# min_v = float(self.val_min_dict[ "val_{}".format(i+1) ])
max_v = float(self.val_max_dict["val_{}".format(i + 1)])
# return ( float(val) - min_v ) * 1.0 / (max_v - min_v)
return float(val) * 1.0 / max_v
id_list = []
weight_list = []
for i, val in enumerate(values):
if val == "":
id_list.append(i)
weight_list.append(0)
else:
key = "val_{}".format(i + 1)
id_list.append(self.cat2id_dict[key])
weight_list.append(minmax_scale_value(i, float(val)))
for i, cat_str in enumerate(cats):
key = "cat_{}".format(i + 1) + "_" + cat_str
if key in self.cat2id_dict:
id_list.append(self.cat2id_dict[key])
else:
id_list.append(self.cat2id_dict[self.oov_prefix + "cat_{}".format(i + 1)])
weight_list.append(1.0)
return id_list, weight_list
def mkdir_path(file_path):
if not os.path.exists(file_path):
os.makedirs(file_path)
def statsdata(data_source_path, output_path, data_stats1):
with open(data_source_path, encoding="utf-8") as file_in:
errorline_list = []
count = 0
for line in file_in:
count += 1
line = line.strip("\n")
items = line.split("\t")
if len(items) != 40:
errorline_list.append(count)
print("line: {}".format(line))
continue
if count % 1000000 == 0:
print("Have handle {}w lines.".format(count // 10000))
values = items[1:14]
cats = items[14:]
assert len(values) == 13, "values.size {}".format(len(values))
assert len(cats) == 26, "cats.size {}".format(len(cats))
data_stats1.stats_vals(values)
data_stats1.stats_cats(cats)
data_stats1.save_dict(output_path)
def add_write(file_path, wrt_str):
with open(file_path, 'a', encoding="utf-8") as file_out:
file_out.write(wrt_str + "\n")
def random_split_trans2h5(input_file_path, output_path, data_stats2, part_rows=2000000, test_size=0.1, seed=2020):
test_size = int(TRAIN_LINE_COUNT * test_size)
all_indices = [i for i in range(TRAIN_LINE_COUNT)]
np.random.seed(seed)
np.random.shuffle(all_indices)
print("all_indices.size: {}".format(len(all_indices)))
test_indices_set = set(all_indices[: test_size])
print("test_indices_set.size: {}".format(len(test_indices_set)))
print("----------" * 10 + "\n" * 2)
train_feature_file_name = os.path.join(output_path, "train_input_part_{}.h5")
train_label_file_name = os.path.join(output_path, "train_output_part_{}.h5")
test_feature_file_name = os.path.join(output_path, "test_input_part_{}.h5")
test_label_file_name = os.path.join(output_path, "test_output_part_{}.h5")
train_feature_list = []
train_label_list = []
test_feature_list = []
test_label_list = []
with open(input_file_path, encoding="utf-8") as file_in:
count = 0
train_part_number = 0
test_part_number = 0
for i, line in enumerate(file_in):
count += 1
if count % 1000000 == 0:
print("Have handle {}w lines.".format(count // 10000))
line = line.strip("\n")
items = line.split("\t")
if len(items) != 40:
continue
label = float(items[0])
values = items[1:14]
cats = items[14:]
assert len(values) == 13, "values.size {}".format(len(values))
assert len(cats) == 26, "cats.size {}".format(len(cats))
ids, wts = data_stats2.map_cat2id(values, cats)
if i not in test_indices_set:
train_feature_list.append(ids + wts)
train_label_list.append(label)
else:
test_feature_list.append(ids + wts)
test_label_list.append(label)
if train_label_list and (len(train_label_list) % part_rows == 0):
pd.DataFrame(np.asarray(train_feature_list)).to_hdf(train_feature_file_name.format(train_part_number),
key="fixed")
pd.DataFrame(np.asarray(train_label_list)).to_hdf(train_label_file_name.format(train_part_number),
key="fixed")
train_feature_list = []
train_label_list = []
train_part_number += 1
if test_label_list and (len(test_label_list) % part_rows == 0):
pd.DataFrame(np.asarray(test_feature_list)).to_hdf(test_feature_file_name.format(test_part_number),
key="fixed")
pd.DataFrame(np.asarray(test_label_list)).to_hdf(test_label_file_name.format(test_part_number),
key="fixed")
test_feature_list = []
test_label_list = []
test_part_number += 1
if train_label_list:
pd.DataFrame(np.asarray(train_feature_list)).to_hdf(train_feature_file_name.format(train_part_number),
key="fixed")
pd.DataFrame(np.asarray(train_label_list)).to_hdf(train_label_file_name.format(train_part_number),
key="fixed")
if test_label_list:
pd.DataFrame(np.asarray(test_feature_list)).to_hdf(test_feature_file_name.format(test_part_number),
key="fixed")
pd.DataFrame(np.asarray(test_label_list)).to_hdf(test_label_file_name.format(test_part_number), key="fixed")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Get and Process datasets')
parser.add_argument('--base_path', default="/home/wushuquan/tmp/", help='The path to save dataset')
parser.add_argument('--output_path', default="/home/wushuquan/tmp/h5dataset/",
help='The path to save h5 dataset')
args, _ = parser.parse_known_args()
base_path = args.base_path
data_path = base_path + ""
# mkdir_path(data_path)
# if not os.path.exists(base_path + "dac.tar.gz"):
# os.system(
# "wget -P {} -c https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz --no-check-certificate".format(
# base_path))
os.system("tar -zxvf {}dac.tar.gz".format(data_path))
print("********tar end***********")
data_stats = DataStatsDict()
# step 1, stats the vocab and normalize value
data_file_path = "./train.txt"
stats_output_path = base_path + "stats_dict/"
mkdir_path(stats_output_path)
statsdata(data_file_path, stats_output_path, data_stats)
print("----------" * 10)
data_stats.load_dict(dict_path=stats_output_path, prefix="")
data_stats.get_cat2id(threshold=100)
# step 2, transform data trans2h5; version 2: np.random.shuffle
in_file_path = "./train.txt"
mkdir_path(args.output_path)
random_split_trans2h5(in_file_path, args.output_path, data_stats, part_rows=2000000, test_size=0.1, seed=2020)
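
Each feature row written by random_split_trans2h5 is the concatenation ids + wts, i.e. 39 feature ids followed by 39 weights, so a feature part holds a (rows, 78) table and the matching label part a single column. A minimal sketch for inspecting the first generated part, assuming the default --output_path above:

import pandas as pd

# key="fixed" matches the key used by to_hdf in random_split_trans2h5
features = pd.read_hdf("/home/wushuquan/tmp/h5dataset/train_input_part_0.h5", key="fixed")
labels = pd.read_hdf("/home/wushuquan/tmp/h5dataset/train_output_part_0.h5", key="fixed")
print(features.shape)  # expected (2000000, 78): 39 ids then 39 weights per sample
print(labels.shape)    # expected (2000000, 1): the click label from column 0 of train.txt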

@@ -0,0 +1,110 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Defined callback for DeepFM.
"""
import time
from mindspore.train.callback import Callback
def add_write(file_path, out_str):
with open(file_path, 'a+', encoding='utf-8') as file_out:
file_out.write(out_str + '\n')
class EvalCallBack(Callback):
"""
    Evaluate the model on the evaluation dataset at the end of each epoch
    and append the metric result and evaluation time to eval_file_path.
"""
def __init__(self, model, eval_dataset, auc_metric, eval_file_path):
super(EvalCallBack, self).__init__()
self.model = model
self.eval_dataset = eval_dataset
self.aucMetric = auc_metric
self.aucMetric.clear()
self.eval_file_path = eval_file_path
def epoch_end(self, run_context):
start_time = time.time()
out = self.model.eval(self.eval_dataset)
eval_time = int(time.time() - start_time)
time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
out_str = "{} EvalCallBack metric{}; eval_time{}s".format(
time_str, out.values(), eval_time)
print(out_str)
add_write(self.eval_file_path, out_str)
class LossCallBack(Callback):
"""
Monitor the loss in training.
    If the loss is NAN or INF, training is terminated.
    Note:
        If per_print_times is 0, the loss is not printed.
    Args:
        loss_file_path (str): The absolute path of the file the loss is saved to;
        per_print_times (int): Print the loss every per_print_times steps. Default: 1.
"""
def __init__(self, loss_file_path, per_print_times=1):
super(LossCallBack, self).__init__()
if not isinstance(per_print_times, int) or per_print_times < 0:
raise ValueError("print_step must be int and >= 0.")
self.loss_file_path = loss_file_path
self._per_print_times = per_print_times
self.loss = 0
def step_end(self, run_context):
cb_params = run_context.original_args()
loss = cb_params.net_outputs.asnumpy()
cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
cur_num = cb_params.cur_step_num
if self._per_print_times != 0 and cur_num % self._per_print_times == 0:
with open(self.loss_file_path, "a+") as loss_file:
time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
loss_file.write("{} epoch: {} step: {}, loss is {}\n".format(
time_str, cb_params.cur_epoch_num, cur_step_in_epoch, loss))
print("epoch: {} step: {}, loss is {}\n".format(
cb_params.cur_epoch_num, cur_step_in_epoch, loss))
self.loss = loss
class TimeMonitor(Callback):
"""
Time monitor for calculating cost of each epoch.
    Args:
        data_size (int): step size of an epoch.
"""
def __init__(self, data_size):
super(TimeMonitor, self).__init__()
self.data_size = data_size
self.per_step_time = 0
def epoch_begin(self, run_context):
self.epoch_time = time.time()
def epoch_end(self, run_context):
epoch_mseconds = (time.time() - self.epoch_time) * 1000
per_step_mseconds = epoch_mseconds / self.data_size
print("epoch time: {0}, per step time: {1}".format(epoch_mseconds, per_step_mseconds), flush=True)
self.per_step_time = per_step_mseconds
def step_begin(self, run_context):
self.step_time = time.time()
def step_end(self, run_context):
step_mseconds = (time.time() - self.step_time) * 1000
print(f"step time {step_mseconds}", flush=True)

@@ -0,0 +1,62 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
class DataConfig:
"""
Define parameters of dataset.
"""
data_vocab_size = 184965
train_num_of_parts = 21
test_num_of_parts = 3
batch_size = 1000
data_field_size = 39
# dataset format, 1: mindrecord, 2: tfrecord, 3: h5
data_format = 3
class ModelConfig:
"""
Define parameters of model.
"""
batch_size = DataConfig.batch_size
data_field_size = DataConfig.data_field_size
data_vocab_size = DataConfig.data_vocab_size
data_emb_dim = 80
deep_layer_args = [[400, 400, 512], "relu"]
init_args = [-0.01, 0.01]
weight_bias_init = ['normal', 'normal']
keep_prob = 0.9
class TrainConfig:
"""
Define parameters of training.
"""
batch_size = DataConfig.batch_size
l2_coef = 1e-6
learning_rate = 1e-5
epsilon = 1e-8
loss_scale = 1024.0
train_epochs = 3
save_checkpoint = True
ckpt_file_name_prefix = "deepfm"
save_checkpoint_steps = 1
keep_checkpoint_max = 15
eval_callback = True
loss_callback = True
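
data_vocab_size above is consistent with the id space built by DataStatsDict during preprocessing: 13 ids for the numeric columns, 26 OOV ids for the categorical columns, plus the 184926 categorical values that survive the threshold=100 filter (the after_all_count noted in the preprocessing comment). A quick sanity check of that arithmetic, using those counts as assumptions:

# id space assembled by DataStatsDict.get_cat2id(threshold=100)
num_numeric_ids = 13          # val_1 .. val_13
num_oov_ids = 26              # OOV_cat_1 .. OOV_cat_26
num_kept_categories = 184926  # "after_all_count" from the preprocessing comment
assert num_numeric_ids + num_oov_ids + num_kept_categories == 184965  # DataConfig.data_vocab_size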

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,80 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_criteo."""
import os
import pytest
from mindspore import context
from mindspore.train.model import Model
from mindspore.common import set_seed
from src.deepfm import ModelBuilder, AUCMetric
from src.config import DataConfig, ModelConfig, TrainConfig
from src.dataset import create_dataset, DataType
from src.callback import EvalCallBack, LossCallBack, TimeMonitor
set_seed(1)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_deepfm():
data_config = DataConfig()
train_config = TrainConfig()
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
rank_size = None
rank_id = None
dataset_path = "/home/workspace/mindspore_dataset/criteo_data/criteo_h5/"
print("dataset_path:", dataset_path)
ds_train = create_dataset(dataset_path,
train_mode=True,
epochs=1,
batch_size=train_config.batch_size,
data_type=DataType(data_config.data_format),
rank_size=rank_size,
rank_id=rank_id)
model_builder = ModelBuilder(ModelConfig, TrainConfig)
train_net, eval_net = model_builder.get_train_eval_net()
auc_metric = AUCMetric()
model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
loss_file_name = './loss.log'
time_callback = TimeMonitor(data_size=ds_train.get_dataset_size())
loss_callback = LossCallBack(loss_file_path=loss_file_name)
callback_list = [time_callback, loss_callback]
eval_file_name = './auc.log'
ds_eval = create_dataset(dataset_path, train_mode=False,
epochs=1,
batch_size=train_config.batch_size,
data_type=DataType(data_config.data_format))
eval_callback = EvalCallBack(model, ds_eval, auc_metric,
eval_file_path=eval_file_name)
callback_list.append(eval_callback)
print("train_config.train_epochs:", train_config.train_epochs)
model.train(train_config.train_epochs, ds_train, callbacks=callback_list)
    expect_loss_value = 0.51
    print("loss_callback.loss:", loss_callback.loss)
    assert loss_callback.loss < expect_loss_value
    expect_per_step_time = 10.4
    print("time_callback:", time_callback.per_step_time)
    assert time_callback.per_step_time < expect_per_step_time
print("*******test case pass!********")