
  from setuptools import find_packages, setup

  # Third-party packages that must be installed alongside pygcn.
  _INSTALL_REQUIRES = ['numpy', 'torch', 'scipy']

  # NOTE(review): the only package_data pattern is an empty string -- presumably
  # a file name was lost here; confirm what should ship with the package.
  _PACKAGE_DATA = {'pygcn': ['']}

  # Register the pygcn distribution; every sub-package found under the current
  # directory is included.
  setup(
      name='pygcn',
      version='0.1',
      description='Graph Convolutional Networks in PyTorch',
      author='Thomas Kipf',
      author_email='',
      url='',
      download_url='',
      license='MIT',
      install_requires=_INSTALL_REQUIRES,
      package_data=_PACKAGE_DATA,
      packages=find_packages(),
  )

  import random

  import numpy as np
  import scipy.io as sio
  import scipy.sparse as sp
  import torch
  from sklearn import preprocessing
  def encode_onehot(labels):
      """Encode a 1-D sequence of class labels as one-hot rows.

      Classes are assigned to columns in sorted order, so the encoding is the
      same on every run. (Iterating a ``set`` of strings, as an earlier
      version did, yields an order that can change between processes.)

      Args:
          labels: 1-D sequence (list or NumPy array) of mutually comparable
              labels, e.g. class-name strings.

      Returns:
          ``np.ndarray`` of dtype ``int32`` with shape
          ``(len(labels), number_of_distinct_labels)``; row ``i`` holds a
          single 1 in the column that belongs to the class of ``labels[i]``.
      """
      # np.unique gives the sorted distinct classes and, for every label, the
      # position of its class in that sorted array.
      classes, class_index = np.unique(np.asarray(labels), return_inverse=True)
      # Build the identity matrix once and pick one row per label.
      identity = np.eye(len(classes), dtype=np.int32)
      return identity[class_index.reshape(-1)]
  19. '''数据读取'''
  20. # 更改路径。由../改为C:\Users\73416\PycharmProjects\PyGCN
  def load_data(path="C:/Users/73416/PycharmProjects/PyGCN_Visualization/data/cora/", dataset="cora"):
      """Load a citation-network dataset (only Cora has been used so far).

      Reads ``<path><dataset>.content`` (one row per paper: paper id, feature
      columns, class name) and ``<path><dataset>.cites`` (one row per
      citation: two paper ids).

      Args:
          path: Directory prefix of the data files, trailing separator included.
          dataset: Base name shared by the ``.content`` and ``.cites`` files.

      Returns:
          Tuple ``(adj, features, labels, idx_train, idx_val, idx_test)``:
          the normalised adjacency matrix (with self-loops) converted by
          ``sparse_mx_to_torch_sparse_tensor``, the normalised dense feature
          tensor, the class index of every node, and three ``LongTensor``s
          holding the fixed node ranges 0-139, 200-499 and 500-1499.
      """
      print('Loading {} dataset...'.format(dataset))

      # Everything is read as strings first because the columns mix ids,
      # 0/1 features and class names.
      content_file = "{}{}.content".format(path, dataset)
      raw_table = np.genfromtxt(content_file, dtype=np.dtype(str))

      # Middle columns are the features, the last column is the class name.
      features = sp.csr_matrix(raw_table[:, 1:-1], dtype=np.float32)
      labels = encode_onehot(raw_table[:, -1])

      # Map every original paper id (first column) to its row number so that
      # edges can be expressed with consecutive indices starting at 0.
      paper_ids = np.array(raw_table[:, 0], dtype=np.int32)
      row_of_paper = {paper_id: row for row, paper_id in enumerate(paper_ids)}

      cites_file = "{}{}.cites".format(path, dataset)
      raw_edges = np.genfromtxt(cites_file, dtype=np.int32)
      renumbered = [row_of_paper.get(paper_id) for paper_id in raw_edges.flatten()]
      edges = np.array(renumbered, dtype=np.int32).reshape(raw_edges.shape)

      # One entry of 1 per citation, in an N x N matrix (N = number of papers).
      n_nodes = labels.shape[0]
      edge_weights = np.ones(edges.shape[0])
      adj = sp.coo_matrix((edge_weights, (edges[:, 0], edges[:, 1])),
                          shape=(n_nodes, n_nodes),
                          dtype=np.float32)

      # Symmetrise: wherever the transposed entry is larger, take it instead,
      # i.e. keep the element-wise maximum of adj and adj.T.
      transpose_larger = adj.T > adj
      adj = adj + adj.T.multiply(transpose_larger) - adj.multiply(transpose_larger)

      # Normalise the features, and the adjacency after adding self-loops.
      features = normalize(features)
      adj = normalize(adj + sp.eye(adj.shape[0]))

      # Convert everything to torch tensors.
      features = torch.FloatTensor(np.array(features.todense()))
      labels = torch.LongTensor(np.where(labels)[1])
      adj = sparse_mx_to_torch_sparse_tensor(adj)

      # Fixed index ranges for the train / validation / test split.
      idx_train = torch.LongTensor(range(140))
      idx_val = torch.LongTensor(range(200, 500))
      idx_test = torch.LongTensor(range(500, 1500))

      return adj, features, labels, idx_train, idx_val, idx_test
  def load_data2(dataset_source):
      """Load an attributed-network dataset from ``../data/<dataset_source>.mat``.

      The .mat file must contain the variables ``Attributes`` (node features),
      ``Network`` (adjacency matrix) and ``Label`` (one class label per node).
      Unlike ``load_data``, the features are not normalised and the adjacency
      matrix is not symmetrised; only self-loops and normalisation are applied
      to the adjacency matrix.

      Args:
          dataset_source: Base name of the .mat file (without extension).

      Returns:
          Tuple ``(adj, features, labels, idx_train, idx_val, idx_test)``.
          The split is a random permutation of the nodes: the first 5% are
          used for training, the next 10% for validation and the rest for
          testing. The permutation uses NumPy's global random state, so seed
          it beforehand for a reproducible split.
      """
      data = sio.loadmat("../data/{}.mat".format(dataset_source))
      features = data["Attributes"]
      adj = data["Network"]
      labels = data["Label"]

      lb = preprocessing.LabelBinarizer()
      labels = lb.fit_transform(labels)
      # With at most two classes LabelBinarizer returns a single 0/1 column.
      # Expand it to two columns so that np.where below yields exactly one
      # class index per node instead of silently dropping the 0-rows.
      if labels.shape[1] == 1:
          labels = np.hstack([1 - labels, labels])

      # Add self-loops, then normalise.
      adj = normalize(adj + sp.eye(adj.shape[0]))

      node_perm = np.random.permutation(labels.shape[0])
      num_train = int(0.05 * adj.shape[0])
      num_val = int(0.1 * adj.shape[0])
      idx_train = node_perm[:num_train]
      idx_val = node_perm[num_train:num_train + num_val]
      idx_test = node_perm[num_train + num_val:]

      # "Attributes" may be stored either sparse or dense in the .mat file.
      if sp.issparse(features):
          features = features.todense()
      features = torch.FloatTensor(np.array(features, dtype=np.float32))
      labels = torch.LongTensor(np.where(labels)[1])
      adj = sparse_mx_to_torch_sparse_tensor(adj)

      idx_train = torch.LongTensor(idx_train)
      idx_val = torch.LongTensor(idx_val)
      idx_test = torch.LongTensor(idx_test)

      return adj, features, labels, idx_train, idx_val, idx_test
  def normalize(mx):
      """Row-normalize a sparse matrix so that every non-empty row sums to 1.

      Each row is scaled by the reciprocal of its sum. Rows whose sum is 0
      would produce an infinite factor; that factor is replaced by 0, so such
      rows stay all-zero.

      Args:
          mx: 2-D scipy sparse matrix. Assumed to have a floating-point dtype
              (an integer matrix cannot be raised to the power -1).

      Returns:
          The product ``D^-1 * mx`` where ``D`` is the diagonal matrix of row
          sums.
      """
      rowsum = np.array(mx.sum(1))  # shape (n_rows, 1)
      # Division by zero is expected for empty rows and handled right below,
      # so the warning is suppressed.
      with np.errstate(divide='ignore'):
          r_inv = np.power(rowsum, -1).flatten()  # shape (n_rows,)
      r_inv[np.isinf(r_inv)] = 0.
      r_mat_inv = sp.diags(r_inv)
      # Left-multiplying by the diagonal matrix scales row i by r_inv[i].
      mx = r_mat_inv.dot(mx)
      return mx
  def accuracy(output, labels):
      """Return the fraction of rows of ``output`` whose arg-max equals the label.

      Args:
          output: 2-D tensor of per-class scores, one row per sample.
          labels: 1-D tensor with the true class index of each sample.

      Returns:
          0-dim double tensor: number of correct predictions / ``len(labels)``.
      """
      # max over dim 1 yields (values, indices); the indices are the predictions.
      _, predicted = output.max(1)
      hits = predicted.type_as(labels).eq(labels).double().sum()
      return hits / len(labels)
  def sparse_mx_to_torch_sparse_tensor(sparse_mx):
      """Convert a scipy sparse matrix to a torch sparse tensor.

      Args:
          sparse_mx: scipy sparse matrix in any format.

      Returns:
          A float32 torch sparse COO tensor with the same shape and the same
          stored entries as ``sparse_mx``.
      """
      sparse_mx = sparse_mx.tocoo().astype(np.float32)
      # 2 x nnz index matrix: first row = row indices, second = column indices.
      indices = torch.from_numpy(
          np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
      values = torch.from_numpy(sparse_mx.data)
      shape = torch.Size(sparse_mx.shape)
      # torch.sparse_coo_tensor is the supported replacement for the
      # deprecated torch.sparse.FloatTensor(indices, values, shape) constructor.
      return torch.sparse_coo_tensor(indices, values, shape)


更多关于onehot编码的细节,参见博客:[数据预处理] onehot编码:是什么,为什么,怎么样

  1. ['Genetic_Algorithms', 'Probabilistic_Methods', 'Reinforcement_Learning', 'Neural_Networks', 'Theory', 'Case_Based', 'Rule_Learning']


  1. # 'Genetic_Algorithms': array([1., 0., 0., 0., 0., 0., 0.]),
  2. # 'Probabilistic_Methods': array([0., 1., 0., 0., 0., 0., 0.]),
  3. # 'Reinforcement_Learning': array([0., 0., 1., 0., 0., 0., 0.]),
  4. # 'Neural_Networks': array([0., 0., 0., 1., 0., 0., 0.]),
  5. # 'Theory': array([0., 0., 0., 0., 1., 0., 0.]),
  6. # 'Case_Based': array([0., 0., 0., 0., 0., 1., 0.]),
  7. # 'Rule_Learning': array([0., 0., 0., 0., 0., 0., 1.])}


  from pygcn.utils import encode_onehot
  import numpy as np

  # Walk through the one-hot encoding of the Cora labels step by step and
  # print the data before and after each stage.

  # Load the raw dataset; every field is read as a string.
  path = "../data/cora/"
  dataset = "cora"
  content_file = "{}{}.content".format(path, dataset)
  idx_features_labels = np.genfromtxt(content_file, dtype=np.dtype(str))

  # The last column holds the class name of every paper.
  RawLabels = idx_features_labels[:, -1]
  print("原始论文类别(label):\n", RawLabels)
  # Sample output:
  #   ['Neural_Networks' 'Rule_Learning' 'Reinforcement_Learning' ...
  #    'Genetic_Algorithms' 'Case_Based' 'Neural_Networks']
  print(len(RawLabels))  # 2708

  # set() keeps each distinct class once, in no particular order.
  classes = set(RawLabels)
  print("原始标签的无序不重复元素集\n", classes)
  # Sample output:
  #   {'Genetic_Algorithms', 'Probabilistic_Methods', 'Reinforcement_Learning',
  #    'Neural_Networks', 'Theory', 'Case_Based', 'Rule_Learning'}

  # Map the i-th class to the i-th row of an identity matrix, i.e. to its
  # one-hot vector.
  n_classes = len(classes)
  classes_dict = {}
  for i, c in enumerate(classes):
      classes_dict[c] = np.identity(n_classes)[i, :]
  print("原始标签与onehot编码结果的映射字典\n", classes_dict)
  # Sample output:
  #   {'Genetic_Algorithms': array([1., 0., 0., 0., 0., 0., 0.]),
  #    'Probabilistic_Methods': array([0., 1., 0., 0., 0., 0., 0.]),
  #    ...
  #    'Rule_Learning': array([0., 0., 0., 0., 0., 0., 1.])}

  # Replace every class name by its one-hot vector.
  onehot_rows = [classes_dict.get(label) for label in RawLabels]
  labels_onehot = np.array(onehot_rows, dtype=np.int32)
  print("onehot编码的论文类别(label):\n", labels_onehot)
  # Sample output:
  #   [[0 0 0 ... 0 0 0]
  #    [0 0 0 ... 1 0 0]
  #    ...
  #    [0 0 0 ... 0 0 0]]
  print(labels_onehot.shape)  # (2708, 7)


  def normalize(mx):
      """Row-normalize a sparse matrix so that every non-empty row sums to 1.

      Each row is scaled by the reciprocal of its sum; rows that sum to 0 get
      a factor of 0 instead of infinity and therefore stay all-zero.

      Args:
          mx: 2-D scipy sparse matrix. Assumed to have a floating-point dtype
              (an integer matrix cannot be raised to the power -1).

      Returns:
          ``D^-1 * mx`` where ``D`` is the diagonal matrix of row sums.
      """
      rowsum = np.array(mx.sum(1))  # shape (n_rows, 1)
      # Empty rows divide by zero on purpose; the resulting inf is zeroed below.
      with np.errstate(divide='ignore'):
          r_inv = np.power(rowsum, -1).flatten()  # shape (n_rows,)
      r_inv[np.isinf(r_inv)] = 0.
      r_mat_inv = sp.diags(r_inv)
      # Left-multiplying by the diagonal matrix scales row i by r_inv[i].
      mx = r_mat_inv.dot(mx)
      return mx


  1. <2708x1433 sparse matrix of type '<class 'numpy.float32'>'
  2. with 49216 stored elements in Compressed Sparse Row format>


  1. <2708x1433 sparse matrix of type '<class 'numpy.float32'>'
  2. with 49216 stored elements in Compressed Sparse Row format>

实现方式:对 mx 的每一行求和并取倒数,得到的就是该行非零元素(即 1)归一化之后的数值;再把这些倒数放到对角矩阵 r_mat_inv 的对角线上,用它左乘原 mx,相当于把每一行都乘以该行的归一化系数,于是原来的 1 就被替换成了归一化后的数值。

  1. sample1_label=RawFeature[0,:]
  2. sumA=sample1_label.sum()

第一行归一化的结果为:第2期--基于Pytorch的Kipf的GCN算法实现 - 图1

归一化后的值正好是 1 / 20 = 0.05 。

  import numpy as np
  import scipy.sparse as sp
  from pygcn.utils import normalize

  # Step-by-step check of the row normalisation performed by normalize():
  # the same statements are executed inline so that every intermediate value
  # can be inspected.

  # Load the raw dataset; every field is read as a string.
  path = "../data/cora/"
  dataset = "cora"
  idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                      dtype=np.dtype(str))

  # Feature columns only (skip the paper id and the class name).
  RawFeature = idx_features_labels[:, 1:-1]
  RawFeature = RawFeature.astype(int)
  sample1_label = RawFeature[0, :]
  sumA = sample1_label.sum()  # row sum of the first paper (20 in the article)
  print("原始的feature\n", RawFeature)
  print(RawFeature.shape)  # (2708, 1433)

  features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
  # <2708x1433 sparse matrix of type '<class 'numpy.float32'>'
  #  with 49216 stored elements in Compressed Sparse Row format>
  print("csr_matrix之后的feature\n", features)
  print(features.shape)  # (2708, 1433)

  # Inline equivalent of: features = normalize(features)
  rowsum = np.array(features.sum(1))  # (2708, 1)
  r_inv = np.power(rowsum, -1).flatten()  # (2708,)
  r_inv[np.isinf(r_inv)] = 0.  # rows that sum to 0 give inf; use 0 instead
  r_mat_inv = sp.diags(r_inv)
  # <2708x2708 sparse matrix of type '<class 'numpy.float32'>'
  #  with 2708 stored elements (1 diagonals) in DIAgonal format>
  mx = r_mat_inv.dot(features)  # scale row i by r_inv[i]
  print('normalization之后的feature\n', mx)
  # Sample output:
  #   (0, 176)      0.05
  #   (0, 125)      0.05
  #   (0, 118)      0.05
  #   ::
  #   (1, 1425)     0.05882353
  #   (1, 1389)     0.05882353
  #   (1, 1263)     0.05882353
  #   ::
  #   (2707, 136)   0.05263158
  #   (2707, 67)    0.05263158
  #   (2707, 19)    0.05263158

数据集读取函数:load_data(path, dataset)

  1. '''数据读取'''
  2. # 更改路径。由../改为C:\Users\73416\PycharmProjects\PyGCN
  def load_data(path="../data/cora/", dataset="cora"):
      """Load a citation-network dataset (only Cora has been used so far).

      ``<path><dataset>.content`` supplies one row per paper (paper id,
      feature columns, class name); ``<path><dataset>.cites`` supplies one row
      per citation (two paper ids).

      Args:
          path: Directory prefix of the data files, trailing separator included.
          dataset: Base name shared by the ``.content`` and ``.cites`` files.

      Returns:
          Tuple ``(adj, features, labels, idx_train, idx_val, idx_test)`` of
          torch tensors: normalised adjacency with self-loops, normalised
          dense features, per-node class indices, and the fixed index ranges
          0-139 / 200-499 / 500-1499 used as train / validation / test split.
      """
      print('Loading {} dataset...'.format(dataset))

      # Read the whole table as strings: ids, 0/1 features and class names.
      table = np.genfromtxt("{}{}.content".format(path, dataset),
                            dtype=np.dtype(str))
      id_column = table[:, 0]
      feature_columns = table[:, 1:-1]
      class_column = table[:, -1]

      features = sp.csr_matrix(feature_columns, dtype=np.float32)
      labels = encode_onehot(class_column)

      # Original paper id -> consecutive row number (0, 1, 2, ...).
      idx = np.array(id_column, dtype=np.int32)
      idx_map = dict((paper_id, row) for row, paper_id in enumerate(idx))

      # Rewrite every citation pair in terms of row numbers.
      edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                      dtype=np.int32)
      flat_rows = [idx_map.get(paper_id) for paper_id in edges_unordered.flatten()]
      edges = np.array(flat_rows, dtype=np.int32).reshape(edges_unordered.shape)

      # N x N matrix with a 1 for every citation (N = number of papers).
      size = labels.shape[0]
      ones = np.ones(edges.shape[0])
      adj = sp.coo_matrix((ones, (edges[:, 0], edges[:, 1])),
                          shape=(size, size),
                          dtype=np.float32)

      # Build a symmetric matrix: take the transposed entry wherever it is
      # the larger one (element-wise maximum of adj and adj.T).
      take_transposed = adj.T > adj
      adj = adj + adj.T.multiply(take_transposed) - adj.multiply(take_transposed)

      # Normalise the features, and the adjacency after adding self-loops.
      features = normalize(features)
      adj = normalize(adj + sp.eye(adj.shape[0]))

      # Hard-coded train / validation / test node ranges.
      split = (range(140), range(200, 500), range(500, 1500))

      # Convert everything to torch tensors.
      features = torch.FloatTensor(np.array(features.todense()))
      labels = torch.LongTensor(np.where(labels)[1])
      adj = sparse_mx_to_torch_sparse_tensor(adj)
      idx_train = torch.LongTensor(split[0])
      idx_val = torch.LongTensor(split[1])
      idx_test = torch.LongTensor(split[2])

      return adj, features, labels, idx_train, idx_val, idx_test


  • 第一列:各个样本的标号(论文编号)
  • 第二列到倒数第二列:各个样本的feature
  • 最后一列:各个样本的label

第2期--基于Pytorch的Kipf的GCN算法实现 - 图2

labels 预处理

即对 labels 进行 onehot 编码。

  1. # idx_features_labels[:, -1]表示只取最后一个,即论文类别,得到的返回值为int类型的label
  2. labels = encode_onehot(idx_features_labels[:, -1])

feature 预处理

即对 feature 进行归一化。由于 feature 为维度较大的稀疏矩阵,故使用 scipy.sparse 来处理。

  1. # csr_matrix:Compressed Sparse Row marix,稀疏np.array的压缩
  2. # idx_features_labels[:, 1:-1]表明跳过论文编号和论文类别,只取自己的信息(feature of node)
  3. features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
  4. ……
  5. # feature和adj归一化
  6. features = normalize(features)

关于处理方法,已经在 特征归一化函数:normalize(mx) 写明了。

构建邻接矩阵 adj


  • 序号预处理:将非连续的离散序号,转化为连续的离散序号(0,1,2,……,2707)。
  • 根据预处理后的序号,将引用关系转化为邻接矩阵adj。

    1. # idx_features_labelsidx_features_labels[:, 0]表示取论文编号
    2. idx = np.array(idx_features_labels[:, 0], dtype=np.int32)


    1. # 通过建立论文序号的序列,得到论文序号的字典
    2. idx_map = {j: i for i, j in enumerate(idx)}


    1. # 读取图的边(论文间的引用关系)
    2. # cora.cites共5429行, 每一行有两个论文编号,表示第一个编号的论文先写,第二个编号的论文引用第一个编号的论文。
    3. edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
    4. dtype=np.int32)


    1. [[ 35 1033]
    2. [ 35 103482]
    3. [ 35 103515]
    4. ...
    5. [ 853118 1140289]
    6. [ 853155 853118]
    7. [ 954315 1155073]]


    1. # 进行一次论文序号的映射
    2. # 论文编号没有用,需要重新的其进行编号(从0开始),然后对原编号进行替换。
    3. # 所以目的是把离散的原始的编号,变成0 - 2707的连续编号
    4. edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
    5. dtype=np.int32).reshape(edges_unordered.shape)


    1. # coo_matrix():系数矩阵的压缩。分别定义有那些非零元素,以及各个非零元素对应的row和col,最后定义稀疏矩阵的shape。
    2. adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
    3. shape=(2708, 2708),
    4. dtype=np.float32)
  • (np.ones(edges.shape[0]):代表稀疏矩阵要填入的值,为1。若邻接矩阵的相应位置被填入1,则说明两个论文中有引用关系。

  • (edges[:, 0], edges[:, 1]):指明了要填入数据的位置,其中edges[:, 0]指明行,edges[:, 1]指明列。
  • shape=(labels.shape[0], labels.shape[0]):指明了adj的shape,为 N × N 的矩阵,其中 N 为样本数。
  • dtype=np.float32:指明了矩阵元素的类型。

最终adj的类型为:scipy.sparse.coo.coo_matrix,使用np.array(adj.todense())将其转为ndarray类型的稠密矩阵后,如下:第2期--基于Pytorch的Kipf的GCN算法实现 - 图3


  1. # build symmetric adjacency matrix
  2. # np.multiply()函数,数组和矩阵对应位置相乘,输出与相乘数组/矩阵的大小一致
  3. adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

得到的邻接矩阵 adj 如下(可与上面的对比)(如果要使用有向图的邻接矩阵,把这一句注释掉就行):第2期--基于Pytorch的Kipf的GCN算法实现 - 图4

  1. adj = normalize(adj + sp.eye(adj.shape[0])) # adj在归一化之前,先引入自环

加入自环后的邻接矩阵adj第2期--基于Pytorch的Kipf的GCN算法实现 - 图5
归一化后的邻接矩阵adj第2期--基于Pytorch的Kipf的GCN算法实现 - 图6


  1. # train set, validation set, test set的分组。
  2. idx_train = range(140)
  3. idx_val = range(200, 500)
  4. idx_test = range(500, 1500)

按照序号划分数据集。这种划分方式并不是论文中的划分方法。论文中是每一类取相同个数 n个样本作为训练集。


  1. # 数据类型转tensor
  2. features = torch.FloatTensor(np.array(features.todense()))
  3. labels = torch.LongTensor(np.where(labels)[1])
  4. adj = sparse_mx_to_torch_sparse_tensor(adj)
  5. idx_train = torch.LongTensor(idx_train)
  6. idx_val = torch.LongTensor(idx_val)
  7. idx_test = torch.LongTensor(idx_test)
  • adj:对于邻接矩阵adj的操作,sparse_mx_to_torch_sparse_tensor(adj),是 Convert a scipy sparse matrix to a torch sparse tensor。具体的细节请看:稀疏矩阵转稀疏张量函数:sparse_mx_to_torch_sparse_tensor(sparse_mx)
  • labels:有一点很有意思,是labels的返回值,这个返回值是长这样的:

第2期--基于Pytorch的Kipf的GCN算法实现 - 图7

  1. tensor([4, 2, 0, ..., 1, 6, 4])
  2. <class 'torch.Tensor'>
  3. torch.Size([2708])

第2期--基于Pytorch的Kipf的GCN算法实现 - 图8


  1. # 返回数据
  2. return adj, features, labels, idx_train, idx_val, idx_test

整个debug load_data()的Demo放到下面了,想尝试的可以拿去用:

  import numpy as np
  import scipy.sparse as sp
  from pygcn.utils import normalize,sparse_mx_to_torch_sparse_tensor,encode_onehot
  import torch

  # Debug walk-through of load_data(): every intermediate result is kept in
  # its own module-level variable so it can be inspected in a debugger.

  # Load the raw dataset; every field is read as a string.
  path = "C:/Users/73416/PycharmProjects/PyGCN_Visualization/data/cora/"
  dataset = "cora"
  content_file = "{}{}.content".format(path, dataset)
  idx_features_labels = np.genfromtxt(content_file, dtype=np.dtype(str))

  # Build the graph. The first column holds the original paper ids; map each
  # of them to its row number.
  idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
  idx_map = dict((paper_id, row) for row, paper_id in enumerate(idx))

  # Each row of the .cites file holds two paper ids (one citation).
  cites_file = "{}{}.cites".format(path, dataset)
  edges_unordered = np.genfromtxt(cites_file, dtype=np.int32)

  # Replace the scattered original ids by the consecutive row numbers.
  mapped_ids = [idx_map.get(paper_id) for paper_id in edges_unordered.flatten()]
  edges = np.array(mapped_ids, dtype=np.int32).reshape(edges_unordered.shape)

  # Sparse adjacency matrix: a 1 at (row, col) for every citation.
  adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                      shape=(2708, 2708),
                      dtype=np.float32)

  # Symmetric adjacency: element-wise maximum of adj and its transpose.
  transpose_larger = adj.T > adj
  adj_sysm = adj + adj.T.multiply(transpose_larger) - adj.multiply(transpose_larger)
  # Add self-loops.
  adj_sysm_self = adj_sysm + sp.eye(adj.shape[0])
  # Normalise.
  adj_norm = normalize(adj_sysm_self)

  features = normalize(sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32))
  labels = encode_onehot(idx_features_labels[:, -1])

  # Convert to torch tensors.
  features = torch.FloatTensor(np.array(features.todense()))
  labels = torch.LongTensor(np.where(labels)[1])
  adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)

  # Added 2020-03-12: keep the asymmetric adjacency as well, i.e. treat the
  # citations as a directed graph (self-loops added, then normalised).
  adj_directed_self = adj + sp.eye(adj.shape[0])
  adj_directed_self_matrix = np.array(adj_directed_self.todense())
  adj_directed_norm = normalize(adj_directed_self)
  adj_directed_norm_matrix = np.array(adj_directed_norm.todense())

计算准确率函数:accuracy(output, labels)

  1. '''计算accuracy'''
  def accuracy(output, labels):
      """Compute the share of samples whose highest-scoring class is correct.

      Args:
          output: 2-D tensor of per-class scores, one row per sample.
          labels: 1-D tensor with the true class index of each sample.

      Returns:
          0-dim double tensor equal to (correct predictions) / ``len(labels)``.
      """
      # Index of the largest score in every row, cast to the dtype of labels.
      best_class = output.max(1)[1]
      preds = best_class.type_as(labels)
      n_correct = preds.eq(labels).double().sum()
      total = len(labels)
      return n_correct / total


  • output为模型model直接的输出,并不是单个的标签(获取预测类别的操作由accuracy(output, labels)中的preds = output.max(1)[1].type_as(labels)实现)。其信息为:

    1. tensor([[-5.3865, -5.8370, -5.6641, ..., -0.0546, -4.6866, -5.4952],
    2. [-1.9110, -3.6502, -0.8442, ..., -3.3036, -1.5383, -2.0366],
    3. [-0.1619, -3.4708, -3.5892, ..., -3.9754, -3.4787, -3.3948],
    4. ...,
    5. [-1.9098, -0.5042, -3.1999, ..., -3.0369, -3.6273, -2.5525],
    6. [-2.6523, -2.9252, -2.6154, ..., -3.0894, -3.3290, -0.3564],
    7. [-4.6700, -4.5324, -4.5864, ..., -0.0916, -4.3737, -3.9876]],
    8. device='cuda:0', grad_fn=<LogSoftmaxBackward>)
    9. <class 'torch.Tensor'>
    10. torch.Size([2708, 7])
  • labels的传入形式:

    1. [4 2 0 ... 1 6 4]
    2. <class 'numpy.ndarray'>
    3. (2708,)



    1. '''稀疏矩阵转稀疏张量'''
    def sparse_mx_to_torch_sparse_tensor(sparse_mx):
        """Convert a scipy sparse matrix to a torch sparse tensor.

        Args:
            sparse_mx: scipy sparse matrix in any format.

        Returns:
            A float32 torch sparse COO tensor with the same shape and the
            same stored entries as ``sparse_mx``.
        """
        # COOrdinate format exposes the .row / .col / .data arrays used below.
        sparse_mx = sparse_mx.tocoo().astype(np.float32)
        # 2 x nnz index matrix: row indices on top, column indices below.
        indices = torch.from_numpy(
            np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
        values = torch.from_numpy(sparse_mx.data)
        shape = torch.Size(sparse_mx.shape)
        # torch.sparse_coo_tensor is the supported replacement for the
        # deprecated torch.sparse.FloatTensor(indices, values, shape).
        return torch.sparse_coo_tensor(indices, values, shape)

    第2期--基于Pytorch的Kipf的GCN算法实现 - 图9
    sparse_mx = sparse_mx.tocoo().astype(np.float32)之后的sparse_mx是长这样的:

第2期--基于Pytorch的Kipf的GCN算法实现 - 图10
也就是说,矩阵还是那个矩阵,只不过通过.tocoo()将矩阵的形式变成了COOrdinate format。

  1. csr_matrix.tocoo(*self*, *copy=True*)
  2. Convert this matrix to COOrdinate format.
  3. With copy=False, the data/indices may be shared between this matrix and the resultant coo_matrix.

indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))这一句是提取稀疏矩阵的非零元素的索引。得到的矩阵是一个[2, 8137]的tensor。
第2期--基于Pytorch的Kipf的GCN算法实现 - 图11
values = torch.from_numpy(sparse_mx.data) 和 shape = torch.Size(sparse_mx.shape) 这两行分别取出了非零元素的数值和矩阵的shape。没什么好说的。

  1. return torch.sparse.FloatTensor(indices, values, shape)

函数返回值应该注意一下。该函数的返回值的类型是 torch.Tensor

  1. print(torch.Tensor)
  2. <class 'torch.Tensor'>

第2期--基于Pytorch的Kipf的GCN算法实现 - 图12

  1. Torch supports sparse tensors in COO(rdinate) format, which can efficiently store and process tensors for which the majority of elements are zeros.
  2. A sparse tensor is represented as a pair of dense tensors: a tensor of values and a 2D tensor of indices. A sparse tensor can be constructed by providing these two tensors, as well as the size of the sparse tensor (which cannot be inferred from these tensors!)


  1. import math
  2. import torch
  3. from torch.nn.parameter import Parameter
  4. from torch.nn.modules.module import Module
  class GraphConvolution(Module):
      """Simple GCN layer, similar to https://arxiv.org/abs/1609.02907.

      The layer computes ``adj @ (input @ weight)`` and adds ``bias`` when the
      layer has one.
      """

      def __init__(self, in_features, out_features, bias=True):
          """Create the layer parameters.

          Args:
              in_features: Number of input features per node.
              out_features: Number of output features per node.
              bias: When true, a learnable bias of size ``out_features`` is
                  added to the output.
          """
          super(GraphConvolution, self).__init__()
          self.in_features = in_features
          self.out_features = out_features
          # Allocated uninitialised; the values are drawn in reset_parameters().
          self.weight = Parameter(torch.FloatTensor(in_features, out_features))
          if bias:
              self.bias = Parameter(torch.FloatTensor(out_features))
          else:
              # Registering None keeps the attribute ``self.bias`` available.
              self.register_parameter('bias', None)
          self.reset_parameters()

      def reset_parameters(self):
          """Fill weight and bias with samples from U(-stdv, stdv).

          ``stdv`` is ``1 / sqrt(out_features)``.
          """
          stdv = 1. / math.sqrt(self.weight.size(1))
          self.weight.data.uniform_(-stdv, stdv)
          if self.bias is not None:
              self.bias.data.uniform_(-stdv, stdv)

      def forward(self, input, adj):
          """Apply the layer.

          Args:
              input: Dense node-feature matrix with ``in_features`` columns.
              adj: Sparse adjacency matrix (it is multiplied with
                  ``torch.spmm``).

          Returns:
              ``adj @ (input @ weight)``, plus ``bias`` if the layer has one.
          """
          support = torch.mm(input, self.weight)  # dense matrix product X * W
          output = torch.spmm(adj, support)  # sparse-dense product A * (X * W)
          if self.bias is not None:
              return output + self.bias
          else:
              return output

      def __repr__(self):
          """Return e.g. ``GraphConvolution (1433 -> 16)``."""
          return self.__class__.__name__ + ' (' \
              + str(self.in_features) + ' -> ' \
              + str(self.out_features) + ')'


  1. '''定义对象的属性'''
  2. def __init__(self, in_features, out_features, bias=True):
  3. super(GraphConvolution, self).__init__()
  4. self.in_features = in_features
  5. self.out_features = out_features
  6. self.weight = Parameter(torch.FloatTensor(in_features, out_features)) # in_features × out_features
  7. if bias:
  8. self.bias = Parameter(torch.FloatTensor(out_features))
  9. else:
  10. self.register_parameter('bias', None)
  11. self.reset_parameters()


  • 设定该层中的 in_features 和 out_features
  • 参数的初始化,通过该对象的 reset_parameters() 方法实现。参数包括:
    • weight:维度为 in_features × out_features
    • bias ( if True ):维度为 out_features

  1. '''生成权重'''
  2. def reset_parameters(self):
  3. stdv = 1. / math.sqrt(self.weight.size(1))
  4. self.weight.data.uniform_(-stdv, stdv) # .uniform_():将tensor用从均匀分布中抽样得到的值填充。
  5. if self.bias is not None:
  6. self.bias.data.uniform_(-stdv, stdv)

就是随机生成权重,不细说了。
但是有一点,生成随机数的种子是可以人为设定的(参见"设定随机数种子"),即可以每次初始化得到相同的初始化参数,从而使得结果可复现。

  3. '''前向传播 of 一层之内:即本层的计算方法:A * X * W '''
  def forward(self, input, adj):
      """Forward pass of a single layer: ``adj @ (input @ weight) (+ bias)``.

      Args:
          input: Dense node-feature matrix.
          adj: Sparse adjacency matrix (it is multiplied with ``torch.spmm``).

      Returns:
          The layer output, with ``self.bias`` added when it is not ``None``.
      """
      support = torch.mm(input, self.weight)  # dense matrix product X * W
      output = torch.spmm(adj, support)  # sparse-dense product A * (X * W)
      if self.bias is not None:
          return output + self.bias
      else:
          return output

这一层是定义的本层的前向传播,即本层的计算方法:A ∗ X ∗ W
support = torch.mm(input, self.weight):input 与 weight 实现矩阵乘法,即 support = X ∗ W。
output = torch.spmm(adj, support):由于 adj 是 torch.sparse 的对象,所以要使用稀疏矩阵乘法 torch.spmm(),实现的功能是得到 output = A ∗ support。
然后再加上bias ( if True ),就得到了本层最后的输出。

  1. '''把一个对象用字符串的形式表达出来以便辨认,在终端调用的时候会显示信息'''
  2. def __repr__(self):
  3. return self.__class__.__name__ + ' (' \
  4. + str(self.in_features) + ' -> ' \
  5. + str(self.out_features) + ')'


  1. import torch.nn as nn
  2. import torch.nn.functional as F
  3. from pygcn.layers import GraphConvolution
  4. '''GCN类'''
  class GCN(nn.Module):
      """Two-layer graph convolutional network.

      Forward pass: relu(gc1) --> dropout --> gc2 --> log_softmax.
      """

      def __init__(self, nfeat, nhid, nclass, dropout):
          """Build the two graph-convolution layers.

          Args:
              nfeat: Number of input features per node.
              nhid: Number of hidden units (output size of the first layer).
              nclass: Number of classes (output size of the second layer).
              dropout: Value passed to ``F.dropout`` as the dropout
                  probability.
          """
          super(GCN, self).__init__()
          self.gc1 = GraphConvolution(nfeat, nhid)  # first layer
          self.gc2 = GraphConvolution(nhid, nclass)  # second layer
          self.dropout = dropout

      def forward(self, x, adj):
          """Return per-node log-probabilities over the classes.

          Args:
              x: Node-feature matrix.
              adj: Adjacency matrix handed to both graph-convolution layers.
          """
          x = F.relu(self.gc1(x, adj))
          # Passing training=self.training applies dropout only while the
          # module is in training mode.
          x = F.dropout(x, self.dropout, training=self.training)
          x = self.gc2(x, adj)
          return F.log_softmax(x, dim=1)

class GCN(nn.Module)定义了一个图卷积神经网络,在这里有两个卷积层。

  def __init__(self, nfeat, nhid, nclass, dropout):
      """Build the two graph-convolution layers and store the dropout rate.

      Args:
          nfeat: Input feature count of the first layer.
          nhid: Hidden width shared by the two layers.
          nclass: Output width of the second layer (number of classes).
          dropout: Dropout probability kept on the instance.
      """
      super(GCN, self).__init__()
      # Layer widths chain as nfeat -> nhid -> nclass.
      first_layer = GraphConvolution(nfeat, nhid)
      second_layer = GraphConvolution(nhid, nclass)
      self.gc1 = first_layer
      self.gc2 = second_layer
      self.dropout = dropout
  • gc1:in_feature = nfeat,为数据的原始的feature;out_feature = nhid。
  • gc2:in_feature = nhid;out_feature = nclass,为最后待分类的类别数。
  • dropout


  1. '''前向传播 of 层间:整个网络的前向传播的方式:relu(gc1) --> dropout --> gc2 --> log_softmax'''
  def forward(self, x, adj):
      """Run the whole network: relu(gc1) -> dropout -> gc2 -> log_softmax.

      Args:
          x: Node-feature matrix fed to the first layer.
          adj: Adjacency matrix handed unchanged to both layers.

      Returns:
          Log-probabilities over the classes (``log_softmax`` along dim 1).
      """
      x = F.relu(self.gc1(x, adj))
      # Dropout follows the module mode, so it is inactive after model.eval().
      x = F.dropout(x, self.dropout, training=self.training)
      x = self.gc2(x, adj)
      return F.log_softmax(x, dim=1)

整个网络的前向传播:整个网络的前向传播的方式:relu(gc1) —> dropout —> gc2 —> log_softmax

  1. from __future__ import division
  2. from __future__ import print_function
  3. import time
  4. import argparse
  5. import numpy as np
  6. import torch
  7. import torch.nn.functional as F
  8. import torch.optim as optim
  9. from utils import load_data, load_data2, accuracy
  10. from models import GCN
  11. # Training settings
  parser = argparse.ArgumentParser()
  # Boolean switches.
  parser.add_argument(
      '--no-cuda', action='store_true', default=False,
      help='Disables CUDA training.')
  parser.add_argument(
      '--fastmode', action='store_true', default=False,
      help='Validate during training pass.')
  # Reproducibility and schedule.
  parser.add_argument(
      '--seed', type=int, default=42, help='Random seed.')
  parser.add_argument(
      '--epochs', type=int, default=100,
      help='Number of epochs to train.')
  # Optimizer hyper-parameters.
  parser.add_argument(
      '--lr', type=float, default=0.01,
      help='Initial learning rate.')
  parser.add_argument(
      '--weight_decay', type=float, default=5e-4,
      help='Weight decay (L2 loss on parameters).')
  # Model hyper-parameters.
  parser.add_argument(
      '--hidden', type=int, default=16,
      help='Number of hidden units.')
  parser.add_argument(
      '--dropout', type=float, default=0.5,
      help='Dropout rate (1 - keep probability).')
  args = parser.parse_args()
  # CUDA is used only when it has not been disabled and a device is available.
  args.cuda = not args.no_cuda and torch.cuda.is_available()

  # Seed NumPy and PyTorch (and the current GPU when CUDA is on).
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)
  if args.cuda:
      torch.cuda.manual_seed(args.seed)
  35. # Load data
  adj, features, labels, idx_train, idx_val, idx_test = load_data()
  # adj, features, labels, idx_train, idx_val, idx_test = load_data2("BlogCatalog")

  # Model and optimizer
  model = GCN(nfeat=features.shape[1],
              nhid=args.hidden,
              # Labels are integer class ids, so the largest id + 1 is the
              # number of classes (7 for Cora according to the original note).
              nclass=labels.max().item() + 1,
              dropout=args.dropout)
  # The learning rate comes from --lr; the original listing had lost this
  # argument, leaving an invalid call.
  optimizer = optim.Adam(model.parameters(),
                         lr=args.lr, weight_decay=args.weight_decay)

  # Move the model and every tensor it consumes to the GPU together.
  if args.cuda:
      model.cuda()
      features = features.cuda()
      adj = adj.cuda()
      labels = labels.cuda()
      idx_train = idx_train.cuda()
      idx_val = idx_val.cuda()
      idx_test = idx_test.cuda()
  def train(epoch):
      """Run one training step on the full graph and print the metrics.

      Args:
          epoch: Zero-based epoch index; reported as ``epoch + 1``.
      """
      started = time.time()

      # Switch to training mode and clear the optimizer's gradients.
      model.train()
      optimizer.zero_grad()

      # The forward pass covers every node, but only the training nodes
      # contribute to the loss that is optimised.
      log_probs = model(features, adj)
      train_loss = F.nll_loss(log_probs[idx_train], labels[idx_train])
      train_acc = accuracy(log_probs[idx_train], labels[idx_train])

      # Back-propagate, then update the parameters.
      train_loss.backward()
      optimizer.step()

      if not args.fastmode:
          # Evaluate validation set performance separately,
          # deactivates dropout during validation run.
          model.eval()
          log_probs = model(features, adj)

      # In fastmode the output of the training pass above is reused here.
      val_loss = F.nll_loss(log_probs[idx_val], labels[idx_val])
      val_acc = accuracy(log_probs[idx_val], labels[idx_val])

      # Report training and validation loss/accuracy plus the step time.
      print('Epoch: {:04d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(train_loss.item()),
            'acc_train: {:.4f}'.format(train_acc.item()),
            'loss_val: {:.4f}'.format(val_loss.item()),
            'acc_val: {:.4f}'.format(val_acc.item()),
            'time: {:.4f}s'.format(time.time() - started))
  def test():
      """Evaluate the model on the test nodes and print loss and accuracy."""
      model.eval()  # switch the model to evaluation mode
      log_probs = model(features, adj)
      test_loss = F.nll_loss(log_probs[idx_test], labels[idx_test])
      test_acc = accuracy(log_probs[idx_test], labels[idx_test])
      print("Test set results:",
            "loss= {:.4f}".format(test_loss.item()),
            "accuracy= {:.4f}".format(test_acc.item()))
  93. # return output # 可视化返回output
  94. # Train model
  # Time the whole optimisation; train(epoch) performs one step per call.
  t_total = time.time()
  for epoch in range(args.epochs):
      train(epoch)
  print("Optimization Finished!")
  print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

  # Testing
  test()


  1. from __future__ import division
  2. from __future__ import print_function
  3. import time
  4. import argparse
  5. import numpy as np
  6. import torch
  7. import torch.nn.functional as F
  8. import torch.optim as optim
  9. from utils import load_data, load_data2, accuracy
  10. from models import GCN


  1. from __future__ import division
  2. from __future__ import print_function
  • 第一条语句:
    在 Python2 中导入未来版本才支持的语言特性 division(精确除法),即 `from __future__ import division`。没有导入该特性时,对两个整数使用“/”操作符执行的是整除(结果取整);只有导入 division 之后,“/”执行的才是精确除法。
  • 第二条语句:
    在开头加上from __future__ import print_function这句之后,即使在python2.X,使用print就得像python3.X那样加括号使用。python2.X中print不需要括号,而在python3.X中则需要。



    # Training settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='Disables CUDA training.')
    parser.add_argument('--fastmode', action='store_true', default=False,
                        help='Validate during training pass.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--epochs', type=int, default=100,
                        help='Number of epochs to train.')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='Initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=5e-4,
                        help='Weight decay (L2 loss on parameters).')
    parser.add_argument('--hidden', type=int, default=16,
                        help='Number of hidden units.')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout rate (1 - keep probability).')

    args = parser.parse_args()  # 生成超参数
    args.cuda = not args.no_cuda and torch.cuda.is_available()  # 作为是否使用 CUDA(GPU)的判定

  1. 参考资料:《Python- argparse.ArgumentParser()用法解析》<br />《parse_args(argsparse):python和命令行之间的交互》
  2. <a name="QdVYW"></a>
  3. ## 设计随机数种子
  4. ```python
  5. #设计随机数种子
  6. np.random.seed(args.seed)
  7. torch.manual_seed(args.seed)
  8. if args.cuda:
  9. torch.cuda.manual_seed(args.seed)


  1. # 随机数一样
  2. random.seed(1)
  3. print('随机数3:',random.random())
  4. random.seed(1)
  5. print('随机数4:',random.random())
  6. random.seed(2)
  7. print('随机数5:',random.random())
  8. '''
  9. 随机数1: 0.7643602170615428
  10. 随机数2: 0.31630323818329664
  11. 随机数3: 0.13436424411240122
  12. 随机数4: 0.13436424411240122
  13. 随机数5: 0.9560342718892494
  14. '''

比如下面的 Demo:

  1. torch.manual_seed(2) #为CPU设置种子用于生成随机数,以使得结果是确定的
  2. print(torch.rand(2))
  3. if args.cuda:
  4. torch.cuda.manual_seed(args.seed) #为当前GPU设置随机种子;
  5. # 如果使用多个GPU,应该使用 torch.cuda.manual_seed_all()为所有的GPU设置种子。



  1. # Load data
  2. adj, features, labels, idx_train, idx_val, idx_test = load_data()


  • adj:是torch.sparse,已进行归一化
  • features:归一化后的特征
  • labels:int类型的标签,注意并不是onehot编码的形式。具体如下:

    1. tensor([4, 2, 0, ..., 1, 6, 4])
    2. <class 'torch.Tensor'>
    3. torch.Size([2708])
  • idx_train、idx_val、idx_test:训练集、验证集、测试集中样本的序号。


    1. # Model
    2. model = GCN(nfeat=features.shape[1],
    3. nhid=args.hidden,
    4. nclass=labels.max().item() + 1, # 对Cora数据集,为7,即类别总数。
    5. dropout=args.dropout)



    # Adam over all model parameters; the step size comes from --lr and the
    # L2 penalty from --weight_decay.
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr, weight_decay=args.weight_decay)



    1. # to CUDA
    2. if args.cuda:
    3. model.cuda()
    4. features = features.cuda()
    5. adj = adj.cuda()
    6. labels = labels.cuda()
    7. idx_train = idx_train.cuda()
    8. idx_val = idx_val.cuda()
    9. idx_test = idx_test.cuda()


    def train(epoch):
        """Run one training step on the full graph and print the metrics.

        Args:
            epoch: Zero-based epoch index; reported as ``epoch + 1``.
        """
        started = time.time()

        # Switch to training mode and clear the optimizer's gradients.
        model.train()
        optimizer.zero_grad()

        # The forward pass covers every node, but only the training nodes
        # contribute to the loss that is optimised.
        log_probs = model(features, adj)
        train_loss = F.nll_loss(log_probs[idx_train], labels[idx_train])
        train_acc = accuracy(log_probs[idx_train], labels[idx_train])

        # Back-propagate, then update the parameters.
        train_loss.backward()
        optimizer.step()

        if not args.fastmode:
            # Evaluate validation set performance separately,
            # deactivates dropout during validation run.
            model.eval()
            log_probs = model(features, adj)

        # In fastmode the output of the training pass above is reused here.
        val_loss = F.nll_loss(log_probs[idx_val], labels[idx_val])
        val_acc = accuracy(log_probs[idx_val], labels[idx_val])

        # Report training and validation loss/accuracy plus the step time.
        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(train_loss.item()),
              'acc_train: {:.4f}'.format(train_acc.item()),
              'loss_val: {:.4f}'.format(val_loss.item()),
              'acc_val: {:.4f}'.format(val_acc.item()),
              'time: {:.4f}s'.format(time.time() - started))



    1. '''将模型转为训练模式,并将优化器梯度置零'''
    2. model.train()
    3. optimizer.zero_grad()


    1. '''计算输出时,对所有的节点计算输出'''
    2. output = model(features, adj)



    1. '''损失函数,仅对训练集节点计算,即:优化仅对训练集数据进行'''
    2. loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    3. # 计算准确率
    4. acc_train = accuracy(output[idx_train], labels[idx_train])



    1. # 反向传播
    2. loss_train.backward()
    3. # 优化
    4. optimizer.step()

    通过计算训练集损失和反向传播及优化,带标签的 label 信息就可以 smooth 到整个图上(label information is smoothed over the graph)。


    1. '''验证集 loss 和 accuracy '''
    2. loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    3. acc_val = accuracy(output[idx_val], labels[idx_val])


    1. '''输出训练集+验证集的 loss 和 accuracy '''
    2. print('Epoch: {:04d}'.format(epoch+1),
    3. 'loss_train: {:.4f}'.format(loss_train.item()),
    4. 'acc_train: {:.4f}'.format(acc_train.item()),
    5. 'loss_val: {:.4f}'.format(loss_val.item()),
    6. 'acc_val: {:.4f}'.format(acc_val.item()),
    7. 'time: {:.4f}s'.format(time.time() - t))


    def test():
        """Evaluate the model on the test nodes and print loss and accuracy."""
        model.eval()  # switch the model to evaluation mode
        log_probs = model(features, adj)
        test_loss = F.nll_loss(log_probs[idx_test], labels[idx_test])
        test_acc = accuracy(log_probs[idx_test], labels[idx_test])
        print("Test set results:",
              "loss= {:.4f}".format(test_loss.item()),
              "accuracy= {:.4f}".format(test_acc.item()))



    1. # Train model
    2. t_total = time.time()
    3. for epoch in range(args.epochs):
    4. train(epoch)
    5. print("Optimization Finished!")
    6. print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

    调用epochs次循环,其中 train(epoch) 是一次训练。


    1. # Testing
    2. test()

