Questions
- ReLU and Glorot initialization don't really go together, do they? Glorot/Xavier scaling is derived for roughly linear (tanh/sigmoid-like) activations, while He initialization is the usual recommendation for ReLU layers; see the sketch below.
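
For context: Glorot/Xavier scaling, std = sqrt(2 / (fan_in + fan_out)), assumes activations that are roughly linear around zero (tanh/sigmoid), while He scaling, std = sqrt(2 / fan_in), compensates for ReLU zeroing out half of its inputs. A minimal sketch of the difference (the `fan_in`, `fan_out`, and `he` names are illustrative, not from the repo, whose `_initialize_weights` uses only the Glorot form):

```python
import numpy as np

# Example fan sizes, e.g. two consecutive entries of deep_layers=[32, 32]
fan_in, fan_out = 32, 32

# Glorot/Xavier std, the form used in _initialize_weights
glorot = np.sqrt(2.0 / (fan_in + fan_out))

# He std, the usual recommendation for ReLU layers (illustrative alternative, not in the repo)
he = np.sqrt(2.0 / fan_in)

W_glorot = np.random.normal(loc=0.0, scale=glorot, size=(fan_in, fan_out))
W_he = np.random.normal(loc=0.0, scale=he, size=(fan_in, fan_out))

print(glorot, he)  # ~0.177 vs ~0.25: He is sqrt(2) times larger when fan_in == fan_out
```

Either way the deep component trains; the mismatch mainly means a somewhat smaller initial activation scale under ReLU than Glorot's derivation assumes.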
Code
Reference: tensorflow-DeepFM/DeepFM.py

```python
"""
Tensorflow implementation of DeepFM [1]

Reference:
[1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,
    Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He.
"""

import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
from time import time
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
from yellowfin import YFOptimizer

class DeepFM(BaseEstimator, TransformerMixin):
    def __init__(self, feature_size, field_size,
                 embedding_size=8, dropout_fm=[1.0, 1.0],
                 deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5],
                 deep_layers_activation=tf.nn.relu,
                 epoch=10, batch_size=256,
                 learning_rate=0.001, optimizer_type="adam",
                 batch_norm=0, batch_norm_decay=0.995,
                 verbose=False, random_seed=2016,
                 use_fm=True, use_deep=True,
                 loss_type="logloss", eval_metric=roc_auc_score,
                 l2_reg=0.0, greater_is_better=True):
        assert (use_fm or use_deep)
        assert loss_type in ["logloss", "mse"], \
            "loss_type can be either 'logloss' for classification task or 'mse' for regression task"

        self.feature_size = feature_size        # denote as M, size of the feature dictionary
        self.field_size = field_size            # denote as F, size of the feature fields
        self.embedding_size = embedding_size    # denote as K, size of the feature embedding

        self.dropout_fm = dropout_fm
        self.deep_layers = deep_layers
        self.dropout_deep = dropout_deep
        self.deep_layers_activation = deep_layers_activation
        self.use_fm = use_fm
        self.use_deep = use_deep
        self.l2_reg = l2_reg

        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type

        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay

        self.verbose = verbose
        self.random_seed = random_seed
        self.loss_type = loss_type
        self.eval_metric = eval_metric
        self.greater_is_better = greater_is_better
        self.train_result, self.valid_result = [], []

        self._init_graph()

    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32, shape=[None, None], name="feat_index")    # None * F
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None], name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")               # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"], self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"], self.feat_index)  # None * F * 1
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ---------------
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
            self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1])  # None * K
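
            # The sum_square / square_sum blocks above use the standard FM identity
            #   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * ((sum_i v_i x_i)^2 - sum_i (v_i x_i)^2),
            # evaluated elementwise over the K embedding dimensions, so the pairwise
            # interactions cost O(F*K) instead of O(F^2*K).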

            # ---------- Deep component ----------
            self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" % i]), self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(tf.matmul(concat_input, self.weights["concat_projection"]), self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                        epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                           initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)

    def _init_session(self):
        config = tf.ConfigProto(device_count={"gpu": 0})
        config.gpu_options.allow_growth = True
        return tf.Session(config=config)

    def _initialize_weights(self):
        weights = dict()

        # embeddings
        weights["feature_embeddings"] = tf.Variable(
            tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),
            name="feature_embeddings")  # feature_size * K
        weights["feature_bias"] = tf.Variable(
            tf.random_uniform([self.feature_size, 1], 0.0, 1.0), name="feature_bias")  # feature_size * 1

        # deep layers
        num_layer = len(self.deep_layers)
        input_size = self.field_size * self.embedding_size
        glorot = np.sqrt(2.0 / (input_size + self.deep_layers[0]))
        weights["layer_0"] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(input_size, self.deep_layers[0])), dtype=np.float32)
        weights["bias_0"] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), dtype=np.float32)  # 1 * layers[0]
        for i in range(1, num_layer):
            glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
            weights["layer_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
                dtype=np.float32)  # layers[i-1] * layers[i]
            weights["bias_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
                dtype=np.float32)  # 1 * layer[i]

        # final concat projection layer
        if self.use_fm and self.use_deep:
            input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
        elif self.use_fm:
            input_size = self.field_size + self.embedding_size
        elif self.use_deep:
            input_size = self.deep_layers[-1]
        glorot = np.sqrt(2.0 / (input_size + 1))
        weights["concat_projection"] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), dtype=np.float32)  # layers[i-1] * layers[i]
        weights["concat_bias"] = tf.Variable(tf.constant(0.01), dtype=np.float32)

        return weights

    def batch_norm_layer(self, x, train_phase, scope_bn):
        bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                              is_training=True, reuse=None, trainable=True, scope=scope_bn)
        bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                                  is_training=False, reuse=True, trainable=True, scope=scope_bn)
        z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
        return z

    def get_batch(self, Xi, Xv, y, batch_size, index):
        start = index * batch_size
        end = (index + 1) * batch_size
        end = end if end < len(y) else len(y)
        return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]]

    # shuffle three lists simultaneously
    def shuffle_in_unison_scary(self, a, b, c):
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)
        np.random.set_state(rng_state)
        np.random.shuffle(c)

    def fit_on_batch(self, Xi, Xv, y):
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.label: y,
                     self.dropout_keep_fm: self.dropout_fm,
                     self.dropout_keep_deep: self.dropout_deep,
                     self.train_phase: True}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def fit(self, Xi_train, Xv_train, y_train,
            Xi_valid=None, Xv_valid=None, y_valid=None,
            early_stopping=False, refit=False):
        """
        :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
                         indi_j is the feature index of feature field j of sample i in the training set
        :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
                         vali_j is the feature value of feature field j of sample i in the training set
                         vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
        :param y_train: label of each sample in the training set
        :param Xi_valid: list of list of feature indices of each sample in the validation set
        :param Xv_valid: list of list of feature values of each sample in the validation set
        :param y_valid: label of each sample in the validation set
        :param early_stopping: perform early stopping or not
        :param refit: refit the model on the train+valid dataset or not
        :return: None
        """
        has_valid = Xv_valid is not None
        for epoch in range(self.epoch):
            t1 = time()
            self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
            total_batch = int(len(y_train) / self.batch_size)
            for i in range(total_batch):
                Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
                self.fit_on_batch(Xi_batch, Xv_batch, y_batch)

            # evaluate training and validation datasets
            train_result = self.evaluate(Xi_train, Xv_train, y_train)
            self.train_result.append(train_result)
            if has_valid:
                valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)
                self.valid_result.append(valid_result)
            if self.verbose > 0 and epoch % self.verbose == 0:
                if has_valid:
                    print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, valid_result, time() - t1))
                else:
                    print("[%d] train-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, time() - t1))
            if has_valid and early_stopping and self.training_termination(self.valid_result):
                break

        # fit a few more epochs on train+valid until result reaches the best_train_score
        if has_valid and refit:
            if self.greater_is_better:
                best_valid_score = max(self.valid_result)
            else:
                best_valid_score = min(self.valid_result)
            best_epoch = self.valid_result.index(best_valid_score)
            best_train_score = self.train_result[best_epoch]
            Xi_train = Xi_train + Xi_valid
            Xv_train = Xv_train + Xv_valid
            y_train = y_train + y_valid
            for epoch in range(100):
                self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
                total_batch = int(len(y_train) / self.batch_size)
                for i in range(total_batch):
                    Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
                    self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
                # check
                train_result = self.evaluate(Xi_train, Xv_train, y_train)
                if abs(train_result - best_train_score) < 0.001 or \
                        (self.greater_is_better and train_result > best_train_score) or \
                        ((not self.greater_is_better) and train_result < best_train_score):
                    break

    def training_termination(self, valid_result):
        if len(valid_result) > 5:
            if self.greater_is_better:
                if valid_result[-1] < valid_result[-2] and \
                        valid_result[-2] < valid_result[-3] and \
                        valid_result[-3] < valid_result[-4] and \
                        valid_result[-4] < valid_result[-5]:
                    return True
            else:
                if valid_result[-1] > valid_result[-2] and \
                        valid_result[-2] > valid_result[-3] and \
                        valid_result[-3] > valid_result[-4] and \
                        valid_result[-4] > valid_result[-5]:
                    return True
        return False

    def predict(self, Xi, Xv):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :return: predicted probability of each sample
        """
        # dummy y
        dummy_y = [1] * len(Xi)
        batch_index = 0
        Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
        y_pred = None
        while len(Xi_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {self.feat_index: Xi_batch,
                         self.feat_value: Xv_batch,
                         self.label: y_batch,
                         self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                         self.dropout_keep_deep: [1.0] * len(self.dropout_deep),
                         self.train_phase: False}
            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch,))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))

            batch_index += 1
            Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)

        return y_pred

    def evaluate(self, Xi, Xv, y):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :param y: label of each sample in the dataset
        :return: metric of the evaluation
        """
        y_pred = self.predict(Xi, Xv)
        return self.eval_metric(y, y_pred)
```
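
For orientation, a minimal usage sketch of the class above. The toy sizes and the `Xi_train` / `Xv_train` / `y_train` lists are made-up placeholders for illustration, not data or helpers from the repo:

```python
# Assumes the DeepFM class defined above is available in scope.
# Each sample has field_size=3 fields: Xi holds feature indices, Xv the matching values
# (1.0 for one-hot categorical fields, the raw value for numerical fields).
Xi_train = [[0, 3, 7], [1, 4, 7], [2, 5, 8], [0, 6, 8]]
Xv_train = [[1.0, 1.0, 0.5], [1.0, 1.0, 1.2], [1.0, 1.0, 0.3], [1.0, 1.0, 2.4]]
y_train = [1, 0, 1, 0]

dfm = DeepFM(feature_size=9,      # M: number of distinct feature indices
             field_size=3,        # F: fields per sample
             embedding_size=4,
             deep_layers=[16, 16],
             epoch=5,
             batch_size=2,
             verbose=1)
dfm.fit(Xi_train, Xv_train, y_train)
probs = dfm.predict(Xi_train, Xv_train)  # per-sample predicted probabilities
```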
