Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BugFix][Spark] Fix the bug that VertexWrite does not generate vertex count file #110

Merged
merged 4 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ package com.alibaba.graphar.reader
import com.alibaba.graphar.utils.{IndexGenerator, DataFrameConcat}
import com.alibaba.graphar.{GeneralParams, VertexInfo, FileType, PropertyGroup}
import com.alibaba.graphar.datasources._
import com.alibaba.graphar.utils.FileSystem

import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
Expand All @@ -36,11 +36,7 @@ class VertexReader(prefix: String, vertexInfo: VertexInfo, spark: SparkSession)
/** Load the total number of vertices for this vertex type. */
def readVerticesNumber(): Long = {
val file_path = prefix + "/" + vertexInfo.getVerticesNumFilePath()
val path = new Path(file_path)
val file_system = FileSystem.get(path.toUri(), spark.sparkContext.hadoopConfiguration)
val input = file_system.open(path)
val number = java.lang.Long.reverseBytes(input.readLong())
file_system.close()
val number = FileSystem.readValue(file_path, spark.sparkContext.hadoopConfiguration)
return number
}

Expand Down
13 changes: 12 additions & 1 deletion spark/src/main/scala/com/alibaba/graphar/utils/FileSystem.scala
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,18 @@ object FileSystem {
val path = new Path(outputPath)
val fs = path.getFileSystem(hadoopConfig)
val output = fs.create(path, true) // create or overwrite
output.writeLong(value)
// consistent with c++ library, convert to little-endian
output.writeLong(java.lang.Long.reverseBytes(value))
output.close()
}

def readValue(inputPath: String, hadoopConfig: Configuration): Long = {
val path = new Path(inputPath)
val fs = path.getFileSystem(hadoopConfig)
val input = fs.open(path)
// consistent with c++ library, little-endian in file, convert to big-endian
val num = java.lang.Long.reverseBytes(input.readLong())
fs.close()
return num
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class VertexWriter(prefix: String, vertexInfo: VertexInfo, vertexDf: DataFrame,
case None => vertexDf.count()
case _ => numVertices.get
}
writeVertexNum()

private var chunks:DataFrame = VertexWriter.repartitionAndSort(vertexDf, vertexInfo.getChunk_size(), vertexNum)

Expand Down
4 changes: 4 additions & 0 deletions spark/src/test/scala/com/alibaba/graphar/TestWriter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package com.alibaba.graphar

import com.alibaba.graphar.utils.IndexGenerator
import com.alibaba.graphar.writer.{VertexWriter, EdgeWriter}
import com.alibaba.graphar.utils
acezen marked this conversation as resolved.
Show resolved Hide resolved

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.scalatest.funsuite.AnyFunSuite
Expand Down Expand Up @@ -61,6 +62,9 @@ class WriterSuite extends AnyFunSuite {
val chunk_path = new Path(prefix + vertex_info.getPrefix() + "*/*")
val chunk_files = fs.globStatus(chunk_path)
assert(chunk_files.length == 20)
val vertex_num_path = prefix + vertex_info.getVerticesNumFilePath()
val number = utils.FileSystem.readValue(vertex_num_path, spark.sparkContext.hadoopConfiguration)
assert(number.toInt == vertex_df.count())

assertThrows[IllegalArgumentException](new VertexWriter(prefix, vertex_info, vertex_df))
val invalid_property_group= new PropertyGroup()
Expand Down