Questions

  1. ReLU and Glorot initialization don't really go together, do they? (See the sketch after the code below.)

    Code

    Reference: tensorflow-DeepFM/DeepFM.py

```python
"""
Tensorflow implementation of DeepFM [1]

Reference:
[1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,
    Huifeng Guo, Ruiming Tang, Yunming Yey, Zhenguo Li, Xiuqiang He.
"""

import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
from time import time
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
from yellowfin import YFOptimizer


class DeepFM(BaseEstimator, TransformerMixin):
    def __init__(self, feature_size, field_size,
                 embedding_size=8, dropout_fm=[1.0, 1.0],
                 deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5],
                 deep_layers_activation=tf.nn.relu,
                 epoch=10, batch_size=256,
                 learning_rate=0.001, optimizer_type="adam",
                 batch_norm=0, batch_norm_decay=0.995,
                 verbose=False, random_seed=2016,
                 use_fm=True, use_deep=True,
                 loss_type="logloss", eval_metric=roc_auc_score,
                 l2_reg=0.0, greater_is_better=True):
        assert (use_fm or use_deep)
        assert loss_type in ["logloss", "mse"], \
            "loss_type can be either 'logloss' for classification task or 'mse' for regression task"

        self.feature_size = feature_size        # denote as M, size of the feature dictionary
        self.field_size = field_size            # denote as F, size of the feature fields
        self.embedding_size = embedding_size    # denote as K, size of the feature embedding

        self.dropout_fm = dropout_fm
        self.deep_layers = deep_layers
        self.dropout_deep = dropout_deep
        self.deep_layers_activation = deep_layers_activation
        self.use_fm = use_fm
        self.use_deep = use_deep
        self.l2_reg = l2_reg

        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type

        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay

        self.verbose = verbose
        self.random_seed = random_seed
        self.loss_type = loss_type
        self.eval_metric = eval_metric
        self.greater_is_better = greater_is_better
        self.train_result, self.valid_result = [], []

        self._init_graph()

    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
                                             name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
                                             name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],
                                                     self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"], self.feat_index)  # None * F * 1
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ----------
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
            self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K

            # second order: 0.5 * (sum-square minus square-sum), the standard FM pairwise-interaction identity
            self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # ---------- Deep component ----------
            self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" % i]), self.weights["bias_%d" % i])  # None * layer[i]
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" % i)  # None * layer[i]
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(tf.matmul(concat_input, self.weights["concat_projection"]), self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(
                    self.l2_reg)(self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                        epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                           initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
                    self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(
                    self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)

    def _init_session(self):
        config = tf.ConfigProto(device_count={"gpu": 0})
        config.gpu_options.allow_growth = True
        return tf.Session(config=config)

    def _initialize_weights(self):
        weights = dict()

        # embeddings
        weights["feature_embeddings"] = tf.Variable(
            tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),
            name="feature_embeddings")  # feature_size * K
        weights["feature_bias"] = tf.Variable(
            tf.random_uniform([self.feature_size, 1], 0.0, 1.0), name="feature_bias")  # feature_size * 1

        # deep layers
        num_layer = len(self.deep_layers)
        input_size = self.field_size * self.embedding_size
        glorot = np.sqrt(2.0 / (input_size + self.deep_layers[0]))
        weights["layer_0"] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(input_size, self.deep_layers[0])), dtype=np.float32)
        weights["bias_0"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])),
                                        dtype=np.float32)  # 1 * layers[0]
        for i in range(1, num_layer):
            glorot = np.sqrt(2.0 / (self.deep_layers[i-1] + self.deep_layers[i]))
            weights["layer_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i-1], self.deep_layers[i])),
                dtype=np.float32)  # layers[i-1] * layers[i]
            weights["bias_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
                dtype=np.float32)  # 1 * layers[i]

        # final concat projection layer
        if self.use_fm and self.use_deep:
            input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
        elif self.use_fm:
            input_size = self.field_size + self.embedding_size
        elif self.use_deep:
            input_size = self.deep_layers[-1]
        glorot = np.sqrt(2.0 / (input_size + 1))
        weights["concat_projection"] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(input_size, 1)),
            dtype=np.float32)  # input_size * 1
        weights["concat_bias"] = tf.Variable(tf.constant(0.01), dtype=np.float32)

        return weights

    def batch_norm_layer(self, x, train_phase, scope_bn):
        bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                              is_training=True, reuse=None, trainable=True, scope=scope_bn)
        bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,
                                  is_training=False, reuse=True, trainable=True, scope=scope_bn)
        z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
        return z

    def get_batch(self, Xi, Xv, y, batch_size, index):
        start = index * batch_size
        end = (index + 1) * batch_size
        end = end if end < len(y) else len(y)
        return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]]

    # shuffle three lists simultaneously
    def shuffle_in_unison_scary(self, a, b, c):
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)
        np.random.set_state(rng_state)
        np.random.shuffle(c)

    def fit_on_batch(self, Xi, Xv, y):
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.label: y,
                     self.dropout_keep_fm: self.dropout_fm,
                     self.dropout_keep_deep: self.dropout_deep,
                     self.train_phase: True}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def fit(self, Xi_train, Xv_train, y_train,
            Xi_valid=None, Xv_valid=None, y_valid=None,
            early_stopping=False, refit=False):
        """
        :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
                         indi_j is the feature index of feature field j of sample i in the training set
        :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]
                         vali_j is the feature value of feature field j of sample i in the training set
                         vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)
        :param y_train: label of each sample in the training set
        :param Xi_valid: list of list of feature indices of each sample in the validation set
        :param Xv_valid: list of list of feature values of each sample in the validation set
        :param y_valid: label of each sample in the validation set
        :param early_stopping: perform early stopping or not
        :param refit: refit the model on the train+valid dataset or not
        :return: None
        """
        has_valid = Xv_valid is not None
        for epoch in range(self.epoch):
            t1 = time()
            self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
            total_batch = int(len(y_train) / self.batch_size)
            for i in range(total_batch):
                Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)
                self.fit_on_batch(Xi_batch, Xv_batch, y_batch)

            # evaluate training and validation datasets
            train_result = self.evaluate(Xi_train, Xv_train, y_train)
            self.train_result.append(train_result)
            if has_valid:
                valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)
                self.valid_result.append(valid_result)
            if self.verbose > 0 and epoch % self.verbose == 0:
                if has_valid:
                    print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, valid_result, time() - t1))
                else:
                    print("[%d] train-result=%.4f [%.1f s]"
                          % (epoch + 1, train_result, time() - t1))
            if has_valid and early_stopping and self.training_termination(self.valid_result):
                break

        # fit a few more epochs on train+valid until result reaches the best_train_score
        if has_valid and refit:
            if self.greater_is_better:
                best_valid_score = max(self.valid_result)
            else:
                best_valid_score = min(self.valid_result)
            best_epoch = self.valid_result.index(best_valid_score)
            best_train_score = self.train_result[best_epoch]
            Xi_train = Xi_train + Xi_valid
            Xv_train = Xv_train + Xv_valid
            y_train = y_train + y_valid
            for epoch in range(100):
                self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
                total_batch = int(len(y_train) / self.batch_size)
                for i in range(total_batch):
                    Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train,
                                                                 self.batch_size, i)
                    self.fit_on_batch(Xi_batch, Xv_batch, y_batch)
                # check
                train_result = self.evaluate(Xi_train, Xv_train, y_train)
                if abs(train_result - best_train_score) < 0.001 or \
                    (self.greater_is_better and train_result > best_train_score) or \
                    ((not self.greater_is_better) and train_result < best_train_score):
                    break

    def training_termination(self, valid_result):
        if len(valid_result) > 5:
            if self.greater_is_better:
                if valid_result[-1] < valid_result[-2] and \
                    valid_result[-2] < valid_result[-3] and \
                    valid_result[-3] < valid_result[-4] and \
                    valid_result[-4] < valid_result[-5]:
                    return True
            else:
                if valid_result[-1] > valid_result[-2] and \
                    valid_result[-2] > valid_result[-3] and \
                    valid_result[-3] > valid_result[-4] and \
                    valid_result[-4] > valid_result[-5]:
                    return True
        return False

    def predict(self, Xi, Xv):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :return: predicted probability of each sample
        """
        # dummy y
        dummy_y = [1] * len(Xi)
        batch_index = 0
        Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)
        y_pred = None
        while len(Xi_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {self.feat_index: Xi_batch,
                         self.feat_value: Xv_batch,
                         self.label: y_batch,
                         self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                         self.dropout_keep_deep: [1.0] * len(self.dropout_deep),
                         self.train_phase: False}
            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch,))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))

            batch_index += 1
            Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)

        return y_pred

    def evaluate(self, Xi, Xv, y):
        """
        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :param y: label of each sample in the dataset
        :return: metric of the evaluation
        """
        y_pred = self.predict(Xi, Xv)
        return self.eval_metric(y, y_pred)

```
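
On question 1: in `_initialize_weights` the deep-layer weights are drawn with scale `np.sqrt(2.0 / (fan_in + fan_out))`, i.e. Glorot/Xavier-style, while `deep_layers_activation` defaults to `tf.nn.relu`. The scheme usually paired with ReLU is He initialization, which scales by `sqrt(2 / fan_in)`; Glorot's derivation assumes a roughly linear (tanh/sigmoid-like) activation around zero. A minimal sketch of swapping in He scaling, assuming one keeps the repo's numpy-draw style (the helper `he_normal_weight` is made up for illustration and is not part of the repo):

```python
import numpy as np
import tensorflow as tf


def he_normal_weight(fan_in, fan_out, name=None):
    # He initialization: variance 2 / fan_in, derived for ReLU units,
    # versus Glorot's 2 / (fan_in + fan_out), derived for tanh/sigmoid.
    scale = np.sqrt(2.0 / fan_in)
    return tf.Variable(
        np.random.normal(loc=0.0, scale=scale, size=(fan_in, fan_out)),
        dtype=np.float32, name=name)


# e.g. inside _initialize_weights, replacing the Glorot-scaled draw:
# weights["layer_0"] = he_normal_weight(input_size, self.deep_layers[0], name="layer_0")
```

The two differ only by a constant factor of `sqrt((fan_in + fan_out) / fan_in)` on the initial standard deviation (about 1.41 when fan-in equals fan-out), so the mismatch is unlikely to break training here, but He scaling is the conventional choice for ReLU layers.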
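
For context, a minimal sketch of how the class is driven end to end, assuming the listing above is saved as `DeepFM.py`; the toy `Xi`/`Xv` lists are made up and only follow the index/value format described in the `fit` docstring:

```python
from DeepFM import DeepFM  # assumes the code above is importable as DeepFM.py

# Toy data: 3 fields, feature dictionary of size 10.
# Xi holds one feature index per field, Xv the corresponding feature value.
Xi_train = [[0, 3, 7], [1, 4, 8], [2, 5, 9], [0, 4, 9]]
Xv_train = [[1.0, 1.0, 0.5], [1.0, 1.0, 0.2], [1.0, 1.0, 0.9], [1.0, 1.0, 0.1]]
y_train = [1, 0, 1, 0]

model = DeepFM(feature_size=10, field_size=3,
               embedding_size=8, deep_layers=[32, 32],
               epoch=2, batch_size=2, verbose=1)
model.fit(Xi_train, Xv_train, y_train)
y_pred = model.predict(Xi_train, Xv_train)  # predicted probability per sample
print(y_pred)
```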