diff --git a/model_zoo/official/recommend/wide_and_deep/src/datasets.py b/model_zoo/official/recommend/wide_and_deep/src/datasets.py
index 96294d1618..a44717ed52 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/datasets.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/datasets.py
@@ -230,7 +230,6 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
     ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column),
                 input_columns=['feat_ids', 'feat_vals', 'label'],
                 column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
-    # if train_mode:
     ds = ds.repeat(epochs)
     return ds

diff --git a/model_zoo/official/recommend/wide_and_deep/src/process_data.py b/model_zoo/official/recommend/wide_and_deep/src/process_data.py
index f8a2362149..705692e96f 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/process_data.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/process_data.py
@@ -33,17 +33,17 @@ class RecommendationDatasetStatsDict():
         self.field_size = 39  # value_1-13; cat_1-26;
         self.val_cols = ["val_{}".format(i+1) for i in range(13)]
         self.cat_cols = ["cat_{}".format(i+1) for i in range(26)]
-        #
+
         self.val_min_dict = {col: 0 for col in self.val_cols}
         self.val_max_dict = {col: 0 for col in self.val_cols}
         self.cat_count_dict = {col: collections.defaultdict(int) for col in self.cat_cols}
-        #
+
         self.oov_prefix = "OOV_"

         self.cat2id_dict = {}
         self.cat2id_dict.update({col: i for i, col in enumerate(self.val_cols)})
         self.cat2id_dict.update({self.oov_prefix + col: i + len(self.val_cols) for i, col in enumerate(self.cat_cols)})
-        #
+
     def stats_vals(self, val_list):
         """vals status"""
         assert len(val_list) == len(self.val_cols)
@@ -54,19 +54,19 @@ class RecommendationDatasetStatsDict():
                 self.val_max_dict[key] = float(val)
             if float(val) < self.val_min_dict[key]:
                 self.val_min_dict[key] = float(val)
-        #
+
         for i, val in enumerate(val_list):
             map_max_min(i, val)
-        #
+
     def stats_cats(self, cat_list):
         assert len(cat_list) == len(self.cat_cols)
         def map_cat_count(i, cat):
             key = self.cat_cols[i]
             self.cat_count_dict[key][cat] += 1
-        #
+
         for i, cat in enumerate(cat_list):
             map_cat_count(i, cat)
-        #
+
     def save_dict(self, output_path, prefix=""):
         with open(os.path.join(output_path, "{}val_max_dict.pkl".format(prefix)), "wb") as file_wrt:
             pickle.dump(self.val_max_dict, file_wrt)
@@ -74,7 +74,7 @@ class RecommendationDatasetStatsDict():
             pickle.dump(self.val_min_dict, file_wrt)
         with open(os.path.join(output_path, "{}cat_count_dict.pkl".format(prefix)), "wb") as file_wrt:
             pickle.dump(self.cat_count_dict, file_wrt)
-        #
+
     def load_dict(self, dict_path, prefix=""):
         with open(os.path.join(dict_path, "{}val_max_dict.pkl".format(prefix)), "rb") as file_wrt:
             self.val_max_dict = pickle.load(file_wrt)
@@ -84,27 +84,20 @@ class RecommendationDatasetStatsDict():
             self.cat_count_dict = pickle.load(file_wrt)
         print("val_max_dict.items()[:50]: {}".format(list(self.val_max_dict.items())))
         print("val_min_dict.items()[:50]: {}".format(list(self.val_min_dict.items())))
-        #
-        #
+
     def get_cat2id(self, threshold=100):
         """get cat to id"""
-        # before_all_count = 0
-        # after_all_count = 0
         for key, cat_count_d in self.cat_count_dict.items():
             new_cat_count_d = dict(filter(lambda x: x[1] > threshold, cat_count_d.items()))
             for cat_str, _ in new_cat_count_d.items():
                 self.cat2id_dict[key + "_" + cat_str] = len(self.cat2id_dict)
-        # print("before_all_count: {}".format(before_all_count))  # before_all_count: 33762577
-        # print("after_all_count: {}".format(after_all_count))  # after_all_count: 184926
         print("cat2id_dict.size: {}".format(len(self.cat2id_dict)))
         print("cat2id_dict.items()[:50]: {}".format(self.cat2id_dict.items()[:50]))
-    #
+
     def map_cat2id(self, values, cats):
         """map cat to id"""
         def minmax_sclae_value(i, val):
-            # min_v = float(self.val_min_dict["val_{}".format(i+1)])
             max_v = float(self.val_max_dict["val_{}".format(i + 1)])
-            # return (float(val) - min_v) * 1.0 / (max_v - min_v)
             return float(val) * 1.0 / max_v

         id_list = []
@@ -117,7 +110,7 @@ class RecommendationDatasetStatsDict():
                 key = "val_{}".format(i + 1)
                 id_list.append(self.cat2id_dict[key])
                 weight_list.append(minmax_sclae_value(i, float(val)))
-        #
+
         for i, cat_str in enumerate(cats):
             key = "cat_{}".format(i + 1) + "_" + cat_str
             if key in self.cat2id_dict:
@@ -126,14 +119,12 @@ class RecommendationDatasetStatsDict():
                 id_list.append(self.cat2id_dict[self.oov_prefix + "cat_{}".format(i + 1)])
                 weight_list.append(1.0)
         return id_list, weight_list
-    #
-


 def mkdir_path(file_path):
     if not os.path.exists(file_path):
         os.makedirs(file_path)
-    #
+

 def statsdata(data_file_path, output_path, recommendation_dataset_stats):
     """data status"""
@@ -150,9 +141,7 @@ def statsdata(data_file_path, output_path, recommendation_dataset_stats):
                 continue
             if count % 1000000 == 0:
                 print("Have handle {}w lines.".format(count//10000))
-            # if count % 5000000 == 0:
-            #     print("Have handle {}w lines.".format(count//10000))
-            # label = items[0]
+
             values = items[1:14]
             cats = items[14:]
             assert len(values) == 13, "value.size: {}".format(len(values))
@@ -160,25 +149,21 @@ def statsdata(data_file_path, output_path, recommendation_dataset_stats):
             recommendation_dataset_stats.stats_vals(values)
             recommendation_dataset_stats.stats_cats(cats)
     recommendation_dataset_stats.save_dict(output_path)
-    #


 def add_write(file_path, wr_str):
     with open(file_path, "a", encoding="utf-8") as file_out:
         file_out.write(wr_str + "\n")
-#


 def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stats,
                           part_rows=2000000, test_size=0.1, seed=2020):
     """random split trans2h5"""
     test_size = int(TRAIN_LINE_COUNT * test_size)
-    # train_size = TRAIN_LINE_COUNT - test_size
     all_indices = [i for i in range(TRAIN_LINE_COUNT)]
     np.random.seed(seed)
     np.random.shuffle(all_indices)
     print("all_indices.size: {}".format(len(all_indices)))
-    #
     lines_count_dict = collections.defaultdict(int)
     test_indices_set = set(all_indices[:test_size])
     print("test_indices_set.size: {}".format(len(test_indices_set)))
     print("------" * 10 + "\n" * 2)
@@ -231,7 +216,7 @@ def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stat
                     test_feature_list = []
                     test_label_list = []
                     test_part_number += 1
-    #
+
     if train_label_list:
         pd.DataFrame(np.asarray(train_feature_list)).to_hdf(train_feature_file_name.format(train_part_number),
                                                             key="fixed")
@@ -242,7 +227,6 @@ def random_split_trans2h5(in_file_path, output_path, recommendation_dataset_stat
                                                             key="fixed")
         pd.DataFrame(np.asarray(test_label_list)).to_hdf(test_label_file_name.format(test_part_number),
                                                          key="fixed")
-#
diff --git a/model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py b/model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py
index 1358f6f76b..519d65efea 100644
--- a/model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py
+++ b/model_zoo/official/recommend/wide_and_deep_multitable/src/wide_and_deep.py
@@ -156,7 +156,6 @@ class WideDeepModel(nn.Cell):
         emb64_multi_size = 20900
         indicator_size = 16
         deep_dim_list = [1024, 1024, 1024, 1024, 1024]
-        # deep_dropout=0.0
         wide_reg_coef = [0.0, 0.0]
         deep_reg_coef = [0.0, 0.0]
         wide_lr = 0.2
@@ -530,7 +529,6 @@ class TrainStepWrap(nn.Cell):
                                            initial_accum=0.1,
                                            loss_scale=sens)

-        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
         self.optimizer_d = Adam(self.weights_d,
                                 learning_rate=config.adam_lr,
                                 eps=1e-6,
diff --git a/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py b/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py
index f71886a55d..b7dc5919ea 100644
--- a/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py
+++ b/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval.py
@@ -90,7 +90,6 @@ def train_and_eval(config):
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
     callback = LossCallBack(config)
     # Only save the last checkpoint at the last epoch. For saving epochs at each epoch, please
-    # set save_checkpoint_steps=ds_train.get_dataset_size()
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*config.epochs,
                                   keep_checkpoint_max=10)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
diff --git a/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py b/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py
index c6262336b0..13443c9b56 100644
--- a/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py
+++ b/model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py
@@ -95,7 +95,6 @@ def train_and_eval(config):
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
     callback = LossCallBack(config)
     # Only save the last checkpoint at the last epoch. For saving epochs at each epoch, please
-    # set save_checkpoint_steps=ds_train.get_dataset_size()
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*config.epochs,
                                   keep_checkpoint_max=10)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',