Spark fetcher is now able to fetch event logs via REST API #225

Merged (11 commits) on Apr 12, 2017
app/com/linkedin/drelephant/spark/fetchers/SparkFetcher.scala (21 additions, 10 deletions)
@@ -19,12 +19,11 @@ package com.linkedin.drelephant.spark.fetchers
import scala.async.Async
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.{Duration, SECONDS}
import scala.util.Try
import scala.util.control.NonFatal

import com.linkedin.drelephant.analysis.{AnalyticJob, ElephantFetcher}
import com.linkedin.drelephant.configurations.fetcher.FetcherConfigurationData
import com.linkedin.drelephant.spark.data.{SparkApplicationData, SparkLogDerivedData, SparkRestDerivedData}
import com.linkedin.drelephant.spark.data.SparkApplicationData
import com.linkedin.drelephant.util.SparkUtils
import org.apache.hadoop.conf.Configuration
import org.apache.log4j.Logger
@@ -61,11 +60,18 @@ class SparkFetcher(fetcherConfigurationData: FetcherConfigurationData)
if (eventLogEnabled) Some(new SparkLogClient(hadoopConfiguration, sparkConf)) else None
}

private[fetchers] lazy val useRestForLogs: Boolean = {
fetcherConfigurationData.getParamMap
.getOrDefault("use_rest_for_eventlogs", "false")
.toBoolean
}

override def fetchData(analyticJob: AnalyticJob): SparkApplicationData = {
val appId = analyticJob.getAppId
logger.info(s"Fetching data for ${appId}")
try {
Await.result(doFetchData(sparkRestClient, sparkLogClient, appId), DEFAULT_TIMEOUT)
Await.result(doFetchData(sparkRestClient, sparkLogClient, appId, useRestForLogs),
DEFAULT_TIMEOUT)
} catch {
case NonFatal(e) =>
logger.error(s"Failed fetching data for ${appId}", e)
@@ -83,17 +89,22 @@ object SparkFetcher {
private def doFetchData(
sparkRestClient: SparkRestClient,
sparkLogClient: Option[SparkLogClient],
appId: String
appId: String,
fetchLogsViaRest: Boolean
)(
implicit ec: ExecutionContext
): Future[SparkApplicationData] = async {
val restDerivedData = await(sparkRestClient.fetchData(appId))
val restDerivedData = await(sparkRestClient.fetchRestData(appId))
val lastAttemptId = restDerivedData.applicationInfo.attempts.maxBy { _.startTime }.attemptId

// Would use .map but await doesn't like that construction.
val logDerivedData = sparkLogClient match {
case Some(sparkLogClient) => Some(await(sparkLogClient.fetchData(appId, lastAttemptId)))
case None => None
val logDerivedData = if (fetchLogsViaRest) {
await(sparkRestClient.fetchLogData(appId, lastAttemptId))
} else {
// Would use .map but await doesn't like that construction.
sparkLogClient match {
case Some(sparkLogClient) =>
Some(await(sparkLogClient.fetchData(appId, lastAttemptId)))
case None => None
}
}

SparkApplicationData(appId, restDerivedData, logDerivedData)
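
For reference, a minimal sketch (hypothetical, not part of this diff) of how the new use_rest_for_eventlogs parameter could be exercised, in the style of the existing SparkFetcherTest mocks. It assumes a caller inside the com.linkedin.drelephant.spark.fetchers package, since useRestForLogs is package-private, and the mock setup is an illustration only.

import java.util.Collections

import com.linkedin.drelephant.configurations.fetcher.FetcherConfigurationData
import org.mockito.Mockito

// Hypothetical mock: a param map that enables the REST log path.
val fetcherConfData = Mockito.mock(classOf[FetcherConfigurationData])
Mockito.when(fetcherConfData.getParamMap)
  .thenReturn(Collections.singletonMap("use_rest_for_eventlogs", "true"))

val fetcher = new SparkFetcher(fetcherConfData)
// useRestForLogs reads the param lazily and defaults to false when the key is absent.
assert(fetcher.useRestForLogs)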
SparkLogClient.scala
@@ -76,7 +76,7 @@ class SparkLogClient(hadoopConfiguration: Configuration, sparkConf: SparkConf) {
val logPath = getLogPath(webhdfsEventLogUri, appId, attemptId, compressionCodecShortName)
logger.info(s"looking for logs at ${logPath}")

val codec = compressionCodecForLogPath(sparkConf, logPath)
val codec = compressionCodecForLogName(sparkConf, logPath.getName)

// Limit scope of async.
async {
@@ -189,10 +189,10 @@ object SparkLogClient {
new BufferedInputStream(fs.open(logPath))
}

private def compressionCodecForLogPath(conf: SparkConf, logPath: Path): Option[CompressionCodec] = {
private[fetchers] def compressionCodecForLogName(conf: SparkConf, logName: String): Option[CompressionCodec] = {
// Compression codec is encoded as an extension, e.g. app_123.lzf
// Since we sanitize the app ID to not include periods, it is safe to split on it
val logBaseName = logPath.getName.stripSuffix(IN_PROGRESS)
val logBaseName = logName.stripSuffix(IN_PROGRESS)
logBaseName.split("\\.").tail.lastOption.map { codecName =>
compressionCodecMap.getOrElseUpdate(codecName, loadCompressionCodec(conf, codecName))
}
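
For illustration, a usage sketch of the renamed helper (the log file names below are hypothetical; since the method is private[fetchers], this assumes a caller inside the com.linkedin.drelephant.spark.fetchers package):

import org.apache.spark.SparkConf

val conf = new SparkConf()
// "app_123_1.lz4" splits on "." into ("app_123_1", "lz4"); the last extension is taken
// as the codec short name, so Spark's LZ4 codec is loaded (and cached for reuse).
val lz4Codec = SparkLogClient.compressionCodecForLogName(conf, "app_123_1.lz4")
// A log with no extension after the sanitized app ID resolves to no codec at all.
val noCodec = SparkLogClient.compressionCodecForLogName(conf, "app_456")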
app/com/linkedin/drelephant/spark/fetchers/SparkRestClient.scala (52 additions, 6 deletions)
@@ -16,24 +16,28 @@

package com.linkedin.drelephant.spark.fetchers

import java.io.BufferedInputStream
import java.net.URI
import java.text.SimpleDateFormat
import java.util.zip.{ZipEntry, ZipInputStream}
import java.util.{Calendar, SimpleTimeZone}

import scala.async.Async
import scala.concurrent.{ExecutionContext, Future}
import scala.util.control.NonFatal

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.linkedin.drelephant.spark.data.SparkRestDerivedData
import com.linkedin.drelephant.spark.fetchers.statusapiv1.{ApplicationAttemptInfo, ApplicationInfo, ExecutorSummary, JobData, StageData}
import com.linkedin.drelephant.spark.data.{SparkLogDerivedData, SparkRestDerivedData}
import com.linkedin.drelephant.spark.fetchers.statusapiv1.{ApplicationInfo, ExecutorSummary, JobData, StageData}
import javax.ws.rs.client.{Client, ClientBuilder, WebTarget}
import javax.ws.rs.core.MediaType

import org.apache.hadoop.fs.Path
import org.apache.log4j.Logger
import org.apache.spark.SparkConf

import scala.collection.mutable

/**
* A client for getting data from the Spark monitoring REST API, e.g. <https://spark.apache.org/docs/1.4.1/monitoring.html#rest-api>.
@@ -67,16 +71,16 @@ class SparkRestClient(sparkConf: SparkConf) {

private val apiTarget: WebTarget = client.target(historyServerUri).path(API_V1_MOUNT_PATH)

def fetchData(appId: String)(implicit ec: ExecutionContext): Future[SparkRestDerivedData] = {
def fetchRestData(appId: String)(implicit ec: ExecutionContext): Future[SparkRestDerivedData] = {
val appTarget = apiTarget.path(s"applications/${appId}")
logger.info(s"calling REST API at ${appTarget.getUri}")

val applicationInfo = getApplicationInfo(appTarget)

// Limit scope of async.
val lastAttemptId = applicationInfo.attempts.maxBy {_.startTime}.attemptId
Contributor: I think you can move lines 80 and 81 back to the async block?

Contributor (author): My motivation was that they are pure and exception-free, and therefore shouldn't be part of async { ... }. Should I move them back anyway?

Contributor: That's right. Then I think you should keep them outside. Just add a comment there explaining why they are outside of the async block.

Contributor (author): Done.
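
For context, a minimal sketch of the pattern being discussed (hypothetical names, not code from this PR): pure, exception-free computations stay outside the async block, and only the awaits on remote calls live inside it.

import scala.async.Async.{async, await}
import scala.concurrent.{ExecutionContext, Future}

def fetchTotal(loadParts: () => Future[Seq[Int]])(implicit ec: ExecutionContext): Future[Int] = {
  // Pure and exception-free, so there is no need to run it inside async { ... }.
  val weight = 2
  async {
    // The async macro only needs to wrap the code that awaits futures.
    val parts = await(loadParts())
    parts.sum * weight
  }
}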

val attemptTarget = lastAttemptId.map(appTarget.path).getOrElse(appTarget)
async {
val lastAttemptId = applicationInfo.attempts.maxBy {_.startTime}.attemptId
val attemptTarget = lastAttemptId.map(appTarget.path).getOrElse(appTarget)
val futureJobDatas = async { getJobDatas(attemptTarget) }
val futureStageDatas = async { getStageDatas(attemptTarget) }
val futureExecutorSummaries = async { getExecutorSummaries(attemptTarget) }
@@ -89,6 +93,34 @@
}
}

def fetchLogData(appId: String, attemptId: Option[String])(
Contributor: I think there is a design issue here. First, it is inconsistent that all the other methods getting data via REST (getApplicationInfo, getJobDatas, etc.) are defined as private methods, while fetchLogData is a public method. The other problem is that line 99 does not consider attemptId when generating appTarget, which is incorrect (please see line 82).

I think a better design would be to introduce an optional member, restEventLogData, in class SparkRestDerivedData and populate that member in the class. Define a private method, getEventLogData, in this class which directly returns Optional[SparkLogDerivedData]. The method getEventLogData can take attemptTarget (defined on line 82) as one of the arguments, in addition to appId and attemptId. This seems more consistent with the rest of the code, and you also don't need to recreate appTarget.

Contributor (author, @superbobry, Mar 22, 2017): This is a very good point. I've considered extending SparkRestDerivedData, but decided against it, because having two places for the same thing is rarely a good idea. However, in this particular case it doesn't look bad, I think.

Summary:

  • SparkRestDerivedData.logDerivedData is now package-private and therefore cannot escape spark.
  • restEventLogData was renamed to getLogData (not getEventLogData, because the rest of the code does not mention events) and stripped down: no more name-checking logic, and no appId/attemptId in the signature.

implicit ec: ExecutionContext
): Future[Option[SparkLogDerivedData]] = {
val appTarget = apiTarget.path(s"applications/${appId}")
logger.info(s"calling REST API at ${appTarget.getUri}")

val logPrefix = attemptId.map(id => s"${appId}_$id").getOrElse(appId)
async {
resource.managed { getApplicationLogs(appTarget) }.acquireAndGet { zis =>
var entry: ZipEntry = null
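// Scan the zip archive entry by entry: skip entries until one whose name starts with
// logPrefix is found, or the archive is exhausted (getNextEntry returns null).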
do {
Contributor: Can you please add a comment which explains the logic of this do...while block?

Contributor (author): Done.

zis.closeEntry()
entry = zis.getNextEntry
} while (!(entry == null || entry.getName.startsWith(logPrefix)))

if (entry == null) {
logger.warn(
s"failed to resolve log starting with $logPrefix for ${appTarget.getUri}")
None
} else {
val codec = SparkLogClient.compressionCodecForLogName(sparkConf, entry.getName)
Some(SparkLogClient.findDerivedData(
codec.map { _.compressedInputStream(zis) }.getOrElse(zis)))
}
}
}
}
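
A minimal usage sketch of the new REST log path (the history server address and the application/attempt IDs below are hypothetical, chosen only for illustration):

import scala.concurrent.Await
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.{Duration, MINUTES}

import org.apache.spark.SparkConf

val sparkConf = new SparkConf()
  .set("spark.yarn.historyServer.address", "historyserver.example.com:18080")
val restClient = new SparkRestClient(sparkConf)

// Downloads applications/<appId>/logs as a zip from the history server, picks the entry
// for the given attempt, and derives log-based data from the (possibly compressed) stream.
val logDerivedData = Await.result(
  restClient.fetchLogData("application_1490000000000_0001", Some("1")),
  Duration(5, MINUTES))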

private def getApplicationInfo(appTarget: WebTarget): ApplicationInfo = {
try {
get(appTarget, SparkRestObjectMapper.readValue[ApplicationInfo])
@@ -100,6 +132,20 @@ class SparkRestClient(sparkConf: SparkConf) {
}
}

private def getApplicationLogs(appTarget: WebTarget): ZipInputStream = {
val target = appTarget.path("logs")
try {
val is = target.request(MediaType.APPLICATION_OCTET_STREAM)
.get(classOf[java.io.InputStream])
new ZipInputStream(new BufferedInputStream(is))
} catch {
case NonFatal(e) => {
logger.error(s"error reading logs ${target.getUri}", e)
throw e
}
}
}

private def getJobDatas(attemptTarget: WebTarget): Seq[JobData] = {
val target = attemptTarget.path("jobs")
try {
SparkFetcherTest.scala
@@ -180,7 +180,7 @@ object SparkFetcherTest {
implicit ec: ExecutionContext
): SparkRestClient = {
val sparkRestClient = Mockito.mock(classOf[SparkRestClient])
Mockito.when(sparkRestClient.fetchData(appId)).thenReturn(restDerivedData)
Mockito.when(sparkRestClient.fetchRestData(appId)).thenReturn(restDerivedData)
sparkRestClient
}

SparkRestClientTest.scala
@@ -67,7 +67,7 @@ class SparkRestClientTest extends AsyncFunSpec with Matchers {
val sparkConf = new SparkConf().set("spark.yarn.historyServer.address", s"${historyServerUri.getHost}:${historyServerUri.getPort}")
val sparkRestClient = new SparkRestClient(sparkConf)

sparkRestClient.fetchData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
sparkRestClient.fetchRestData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
restDerivedData.applicationInfo.id should be(FetchClusterModeDataFixtures.APP_ID)
restDerivedData.applicationInfo.name should be(FetchClusterModeDataFixtures.APP_NAME)
restDerivedData.jobDatas should not be(None)
@@ -101,7 +101,7 @@ class SparkRestClientTest extends AsyncFunSpec with Matchers {
val sparkConf = new SparkConf().set("spark.yarn.historyServer.address", s"${historyServerUri.getHost}:${historyServerUri.getPort}")
val sparkRestClient = new SparkRestClient(sparkConf)

sparkRestClient.fetchData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
sparkRestClient.fetchRestData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
restDerivedData.applicationInfo.id should be(FetchClusterModeDataFixtures.APP_ID)
restDerivedData.applicationInfo.name should be(FetchClusterModeDataFixtures.APP_NAME)
restDerivedData.jobDatas should not be(None)
@@ -135,7 +135,7 @@ class SparkRestClientTest extends AsyncFunSpec with Matchers {
val sparkConf = new SparkConf().set("spark.yarn.historyServer.address", s"http://${historyServerUri.getHost}:${historyServerUri.getPort}")
val sparkRestClient = new SparkRestClient(sparkConf)

sparkRestClient.fetchData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
sparkRestClient.fetchRestData(FetchClusterModeDataFixtures.APP_ID) map { restDerivedData =>
restDerivedData.applicationInfo.id should be(FetchClusterModeDataFixtures.APP_ID)
restDerivedData.applicationInfo.name should be(FetchClusterModeDataFixtures.APP_NAME)
restDerivedData.jobDatas should not be(None)