completed gcn.py

hcnoh · hcnoh · commit 04a9f8d0dfe8 · 2021-12-02T13:03:12.000+09:00
diff --git a/config.json b/config.json
@@ -1,6 +1,6 @@
 {
     "train_config": {
-        "num_epochs": 400,
+        "num_epochs": 200,
         "learning_rate": 0.01
     },
     "model_config": {
diff --git a/data_loaders/citation_networks.py b/data_loaders/citation_networks.py
@@ -11,15 +11,18 @@
 
 
 class CitationNetworks(Dataset):
-    def __init__(self, dataset_dir=DATASET_DIR, directed=False) -> None:
+    def __init__(self, dataset_dir=DATASET_DIR) -> None:
         super().__init__()
 
-        self.dataset_name = None    # will be defined in child classes
+        # will be defined in child classes
+        self.dataset_name = None
+        self.directed = None
+        self.num_features = None
 
         self.dataset_dir = dataset_dir
-        self.directed = directed
 
-        self.num_sample_per_class = 20
+        self.num_train_samples_per_class = 20
+        self.num_test_samples = 1000
 
     def __getitem__(self, index):
         return self.X[index], self.Y[index]
@@ -28,6 +31,11 @@ def __len__(self):
         return self.num_nodes
 
     def preprocess(self):
+        '''
+            The preprocess methods are from the following references:
+            - http://proceedings.mlr.press/v48/yanga16.pdf
+            - https://arxiv.org/pdf/1609.02907.pdf
+        '''
         cites_path = os.path.join(
             self.dataset_dir, "{}.cites".format(self.dataset_name)
         )
@@ -42,21 +50,24 @@ def preprocess(self):
             self.dataset_dir, "{}.content".format(self.dataset_name)
         )
 
-        col_names = ["Node"] + list(range(3703)) + ["Label"]
+        col_names = ["Node"] + list(range(self.num_features)) + ["Label"]
 
         content_df = pd.read_csv(
             content_path, sep="\t", names=col_names, header=None
         )
-        content_df["Feature"] = content_df[range(3703)].agg(list, axis=1)
+        content_df["Feature"] = content_df[range(self.num_features)]\
+            .agg(list, axis=1)
         content_df = content_df[["Node", "Feature", "Label"]]
 
         node_list = np.array([str(node) for node in content_df["Node"].values])
         node2idx = {node: idx for idx, node in enumerate(node_list)}
         num_nodes = node_list.shape[0]
 
+        # Row normalization for the feature matrix
         X = np.array(
             [np.array(feature) for feature in content_df["Feature"].values]
         )
+        X = X / np.sum(X, axis=-1, keepdims=True)
         num_feature_maps = X.shape[-1]
 
         class_list = np.unique(content_df["Label"].values)
@@ -69,16 +80,17 @@ def preprocess(self):
         drop_indices = []
 
         for i, row in cites_df.iterrows():
-            if row["To"] not in node_list or row["From"] not in node_list:
+            if str(row["To"]) not in node_list or \
+                    str(row["From"]) not in node_list:
                 drop_indices.append(i)
 
         cites_df = cites_df.drop(drop_indices)
 
         A = np.zeros([num_nodes, num_nodes])
 
         for _, row in cites_df.iterrows():
-            to_ = row["To"]
-            from_ = row["From"]
+            to_ = str(row["To"])
+            from_ = str(row["From"])
 
             A[node2idx[to_], node2idx[from_]] = 1
             if not self.directed:
@@ -104,23 +116,69 @@ def preprocess(self):
 
         train_indices = np.hstack(
             [
-                np.random.choice(v, self.num_sample_per_class)
+                np.random.choice(v, self.num_train_samples_per_class)
                 for _, v in class2indices.items()
             ]
         )
         test_indices = np.delete(np.arange(num_nodes), train_indices)
+        test_indices = np.random.choice(test_indices, self.num_test_samples)
 
         return A, A_hat, X, Y, node_list, node2idx, num_nodes, \
             num_feature_maps, class_list, class2idx, num_classes, \
             class2indices, train_indices, test_indices
 
 
 class Citeseer(CitationNetworks):
-    def __init__(self) -> None:
+    def __init__(self, directed) -> None:
         super().__init__()
 
+        self.directed = directed
+
+        self.num_features = 3703
+
         self.dataset_name = "citeseer"
         self.dataset_dir = os.path.join(self.dataset_dir, self.dataset_name)
+        if self.directed:
+            self.preprocessed_dir = os.path.join(
+                self.dataset_dir, "directed"
+            )
+        else:
+            self.preprocessed_dir = os.path.join(
+                self.dataset_dir, "undirected"
+            )
+        print(self.preprocessed_dir)
+
+        if not os.path.exists(self.preprocessed_dir):
+            os.mkdir(self.preprocessed_dir)
+
+        if os.path.exists(os.path.join(self.preprocessed_dir, "dataset.pkl")):
+            with open(
+                os.path.join(self.preprocessed_dir, "dataset.pkl"), "rb"
+            ) as f:
+                dataset = pickle.load(f)
+        else:
+            dataset = self.preprocess()
+            with open(
+                os.path.join(self.preprocessed_dir, "dataset.pkl"), "wb"
+            ) as f:
+                pickle.dump(dataset, f)
+
+        self.A, self.A_hat, self.X, self.Y, self.node_list, self.node2idx, \
+            self.num_nodes, self.num_feature_maps, self.class_list, \
+            self.class2idx, self.num_classes, self.class2indices, \
+            self.train_indices, self.test_indices = dataset
+
+
+class Cora(CitationNetworks):
+    def __init__(self, directed) -> None:
+        super().__init__()
+
+        self.directed = directed
+
+        self.num_features = 1433
+
+        self.dataset_name = "cora"
+        self.dataset_dir = os.path.join(self.dataset_dir, self.dataset_name)
         if self.directed:
             self.preprocessed_dir = os.path.join(
                 self.dataset_dir, "directed"
diff --git a/models/gcn.py b/models/gcn.py
@@ -1,6 +1,11 @@
+import os
+
+import numpy as np
 import torch
 
 from torch.nn import Module, Linear, Dropout
+from torch.nn.init import xavier_normal_
+from torch.nn.functional import cross_entropy
 from torch.sparse import mm
 from torch.optim import Adam
 
@@ -35,42 +40,111 @@ def __init__(self, A_hat, C, H, F, num_layers, dropout, regularization):
         self.W0 = Linear(self.C, self.H, bias=False)
         self.W1 = Linear(self.H, self.F, bias=False)
 
+        xavier_normal_(self.W0.weight)
+        xavier_normal_(self.W1.weight)
+
         self.Wh = [
             Linear(self.H, self.H, bias=False)
             for _ in range(self.num_layers - 2)
         ]
 
+        for Wh in self.Wh:
+            xavier_normal_(Wh.weight)
+
         self.dropout_layer = Dropout(self.dropout)
 
         self.L2 = torch.sum(
             FloatTensor([torch.norm(param) for param in self.W0.parameters()])
         )
 
-    def forward(self, X):
+    def get_logits(self, X):
         Z = self.dropout_layer(torch.relu(mm(self.A_hat, self.W0(X))))
         for Wh in self.Wh:
             Z = torch.relu(mm(self.A_hat, Wh(Z)))
-        Z = self.dropout_layer(
-            torch.softmax(mm(self.A_hat, self.W1(Z)), dim=-1)
-        )
+        Z = self.dropout_layer(mm(self.A_hat, self.W1(Z)))
 
         return Z
 
+    def forward(self, X):
+        Z = self.get_logits(X)
+
+        return torch.softmax(Z, dim=-1)
+
     def train_model(
-        self, num_epochs, learning_rate, dataset, train_indices, test_indices
+        self, num_epochs, learning_rate, dataset, train_indices, test_indices,
+        ckpt_path
     ):
+        accs = []
+        train_losses = []
+        test_losses = []
+
+        max_acc = 0
+
         opt = Adam(self.parameters(), learning_rate)
 
         X = FloatTensor(dataset.X)
 
         for i in range(1, num_epochs + 1):
-            self.eval()
+            self.train()
 
             _, Y = dataset[train_indices]
-            Y = FloatTensor(Y)
+            Y = LongTensor(Y)
 
             Z = torch.gather(
-                self(X), dim=0, index=LongTensor(train_indices).unsqueeze(-1).repeat(1, self.F)
+                self.get_logits(X),
+                dim=0,
+                index=LongTensor(train_indices)
+                .unsqueeze(-1).repeat(1, self.F)
             )
 
-            print(self(X).shape, Z.shape, train_indices.shape, self(X)[train_indices[0]] == Z[0])
+            opt.zero_grad()
+            train_loss = cross_entropy(Z, Y)
+            (train_loss + self.regularization * self.L2).backward()
+            opt.step()
+
+            train_loss = train_loss.detach().cpu().numpy()
+
+            train_losses.append(train_loss)
+
+            with torch.no_grad():
+                self.eval()
+
+                _, Y = dataset[test_indices]
+                Y = LongTensor(Y)
+
+                Z = torch.gather(
+                    self.get_logits(X),
+                    dim=0,
+                    index=LongTensor(test_indices)
+                    .unsqueeze(-1).repeat(1, self.F)
+                )
+
+                test_loss = cross_entropy(Z, Y)
+                test_loss = test_loss.detach().cpu().numpy()
+
+                test_losses.append(test_loss)
+
+                Y = Y.detach().cpu().numpy()
+
+                Z = torch.softmax(Z, dim=-1).detach().cpu().numpy()
+                Z = np.argmax(Z, axis=-1)
+
+                acc = np.mean(Y == Z)
+
+                accs.append(acc)
+
+                print(
+                    "Epoch: {}, Train Loss: {}, Test Loss: {}, Test ACC: {}"
+                    .format(i, train_loss, test_loss, acc)
+                )
+
+                if acc > max_acc:
+                    torch.save(
+                        self.state_dict(),
+                        os.path.join(
+                            ckpt_path, "model.ckpt"
+                        )
+                    )
+                    max_acc = acc
+
+        return accs, train_losses, test_losses
diff --git a/train.py b/train.py
@@ -5,9 +5,7 @@
 
 import torch
 
-from torch.utils.data import DataLoader, random_split
-
-from data_loaders.citation_networks import Citeseer
+from data_loaders.citation_networks import Citeseer, Cora
 
 from models.gcn import GCN
 
@@ -44,7 +42,9 @@ def main(dataset_name, directed):
     regularization = model_config["regularization"]
 
     if dataset_name == "citeseer":
-        dataset = Citeseer()
+        dataset = Citeseer(directed=directed)
+    elif dataset_name == "cora":
+        dataset = Cora(directed=directed)
 
     if torch.cuda.is_available():
         device = "cuda"
@@ -59,32 +59,19 @@ def main(dataset_name, directed):
     model = GCN(
         dataset.A_hat, dataset.num_feature_maps, H, dataset.num_classes,
         num_layers, dropout, regularization
-    )
+    ).to(device)
 
-    model.train_model(
+    accs, train_losses, test_losses = model.train_model(
         num_epochs, learning_rate, dataset, dataset.train_indices,
-        dataset.test_indices
+        dataset.test_indices, ckpt_path
     )
 
-    # train_size = dataset.train_indices.shape[0]
-    # test_size = dataset.test_indices.shape[0]
-
-    # train_dataset, test_dataset = random_split(
-    #     dataset, [train_size, test_size]
-    # )
-
-    # train_dataset.indices = dataset.train_indices
-    # test_dataset.indices = dataset.test_indices
-
-    # train_loader = DataLoader(
-    #     train_dataset, batch_size=train_size, shuffle=False
-    # )
-    # test_loader = DataLoader(
-    #     test_dataset, batch_size=test_size, shuffle=False
-    # )
-
-    # print(train_dataset.indices)
-    # print(train_loader.indices)
+    with open(os.path.join(ckpt_path, "accs.pkl"), "wb") as f:
+        pickle.dump(accs, f)
+    with open(os.path.join(ckpt_path, "train_losses.pkl"), "wb") as f:
+        pickle.dump(train_losses, f)
+    with open(os.path.join(ckpt_path, "test_losses.pkl"), "wb") as f:
+        pickle.dump(test_losses, f)
 
 
 if __name__ == "__main__":

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"train_config": {` |
| `3` | | `- "num_epochs": 400,` |
| | `3` | `+ "num_epochs": 200,` |
| `4` | `4` | `"learning_rate": 0.01` |
| `5` | `5` | `},` |
| `6` | `6` | `"model_config": {` |