apache · acezen · Feb 27, 2023 · Feb 27, 2023 · Feb 27, 2023 · Feb 27, 2023
diff --git a/spark/src/main/scala/com/alibaba/graphar/reader/VertexReader.scala b/spark/src/main/scala/com/alibaba/graphar/reader/VertexReader.scala
@@ -18,8 +18,8 @@ package com.alibaba.graphar.reader
 import com.alibaba.graphar.utils.{IndexGenerator, DataFrameConcat}
 import com.alibaba.graphar.{GeneralParams, VertexInfo, FileType, PropertyGroup}
 import com.alibaba.graphar.datasources._
+import com.alibaba.graphar.utils.FileSystem
 
-import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.functions._
@@ -36,11 +36,7 @@ class VertexReader(prefix: String, vertexInfo: VertexInfo, spark: SparkSession)
   /** Load the total number of vertices for this vertex type. */
   def readVerticesNumber(): Long = {
     val file_path = prefix + "/" + vertexInfo.getVerticesNumFilePath()
-    val path = new Path(file_path)
-    val file_system = FileSystem.get(path.toUri(), spark.sparkContext.hadoopConfiguration)
-    val input = file_system.open(path)
-    val number = java.lang.Long.reverseBytes(input.readLong())
-    file_system.close()
+    val number = FileSystem.readValue(file_path, spark.sparkContext.hadoopConfiguration)
     return number
   }
 

diff --git a/spark/src/main/scala/com/alibaba/graphar/utils/FileSystem.scala b/spark/src/main/scala/com/alibaba/graphar/utils/FileSystem.scala
@@ -68,7 +68,18 @@ object FileSystem {
     val path = new Path(outputPath)
     val fs = path.getFileSystem(hadoopConfig)
     val output = fs.create(path, true)  // create or overwrite
-    output.writeLong(value)
+    // consistent with c++ library, convert to little-endian
+    output.writeLong(java.lang.Long.reverseBytes(value))
     output.close()
   }
+
+  def readValue(inputPath: String, hadoopConfig: Configuration): Long = {
+    val path = new Path(inputPath)
+    val fs = path.getFileSystem(hadoopConfig)
+    val input = fs.open(path)
+    // consistent with c++ library, little-endian in file, convert to big-endian
+    val num = java.lang.Long.reverseBytes(input.readLong())
+    fs.close()
+    return num
+  }
 }
diff --git a/spark/src/main/scala/com/alibaba/graphar/writer/VertexWriter.scala b/spark/src/main/scala/com/alibaba/graphar/writer/VertexWriter.scala
@@ -60,6 +60,7 @@ class VertexWriter(prefix: String, vertexInfo: VertexInfo, vertexDf: DataFrame,
     case None => vertexDf.count()
     case _ => numVertices.get
   }
+  writeVertexNum()
 
   private var chunks:DataFrame = VertexWriter.repartitionAndSort(vertexDf, vertexInfo.getChunk_size(), vertexNum)
 

diff --git a/spark/src/test/scala/com/alibaba/graphar/TestWriter.scala b/spark/src/test/scala/com/alibaba/graphar/TestWriter.scala
@@ -17,6 +17,7 @@ package com.alibaba.graphar
 
 import com.alibaba.graphar.utils.IndexGenerator
 import com.alibaba.graphar.writer.{VertexWriter, EdgeWriter}
+import com.alibaba.graphar.utils
 
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.scalatest.funsuite.AnyFunSuite
@@ -61,6 +62,9 @@ class WriterSuite extends AnyFunSuite {
     val chunk_path = new Path(prefix + vertex_info.getPrefix() + "*/*")
     val chunk_files = fs.globStatus(chunk_path)
     assert(chunk_files.length == 20)
+    val vertex_num_path = prefix + vertex_info.getVerticesNumFilePath()
+    val number = utils.FileSystem.readValue(vertex_num_path, spark.sparkContext.hadoopConfiguration)
+    assert(number.toInt == vertex_df.count())
 
     assertThrows[IllegalArgumentException](new VertexWriter(prefix, vertex_info, vertex_df))
     val invalid_property_group= new PropertyGroup()