From bfa8c30690b20de463cdbd5e172c3eaeabebcce4 Mon Sep 17 00:00:00 2001 From: Ryan Scott Date: Tue, 2 Feb 2021 11:14:19 -0500 Subject: [PATCH] WIP: Lang.JVM.Codebase: support loading JIMAGE files This allows `crucible-jvm` to deal with JDK 9 or later, which packages its standard library not in a JAR file, but in a JIMAGE file. Extracting `.class` files from JIMAGE files proves to be surprisingly tricky, and I've carefully documented the intricacies of doing so in `Note [Loading classes from JIMAGE files]` in `Lang.JVM.Codebase`. This is part of a fix for GaloisInc/saw-script#861. --- crucible-jvm/crucible-jvm.cabal | 6 +- crucible-jvm/src/Lang/JVM/Codebase.hs | 237 ++++++++++++++++++++-- crucible-jvm/src/Lang/JVM/JavaTools.hs | 21 +- crucible-jvm/src/Lang/JVM/ProcessUtils.hs | 30 +++ crucible-jvm/tool/Main.hs | 2 +- 5 files changed, 254 insertions(+), 42 deletions(-) create mode 100644 crucible-jvm/src/Lang/JVM/ProcessUtils.hs diff --git a/crucible-jvm/crucible-jvm.cabal b/crucible-jvm/crucible-jvm.cabal index de4bb6cfd..00a154064 100644 --- a/crucible-jvm/crucible-jvm.cabal +++ b/crucible-jvm/crucible-jvm.cabal @@ -53,10 +53,13 @@ library base >= 4 && < 5, aig, array, + base16-bytestring >= 0.1 && < 1.1, bv-sized >= 1.0.0, + bytestring, containers, crucible, - directory, + cryptohash-sha1 >= 0.11 && < 0.12, + directory >= 1.2.5, extra >= 1.6.4, filepath, haskeline >= 0.7, @@ -88,6 +91,7 @@ library Lang.Crucible.JVM.Overrides Lang.JVM.Codebase Lang.JVM.JavaTools + Lang.JVM.ProcessUtils other-modules: ghc-options: -Wall -Wcompat -Werror=incomplete-patterns -Werror=missing-methods -Werror=overlapping-patterns diff --git a/crucible-jvm/src/Lang/JVM/Codebase.hs b/crucible-jvm/src/Lang/JVM/Codebase.hs index 5890f888a..b977427b7 100644 --- a/crucible-jvm/src/Lang/JVM/Codebase.hs +++ b/crucible-jvm/src/Lang/JVM/Codebase.hs @@ -1,6 +1,6 @@ {- | Module : Lang.JVM.Codebase -Description : Codebase manipulation, copied from jvm-verifier package +Description : Java Codebase manipulation License : BSD3 Stability : stable Point-of-contact : jhendrix @@ -27,17 +27,120 @@ module Lang.JVM.Codebase ) where import Control.Monad +import Control.Monad.IO.Class +import Control.Monad.Trans.Maybe (MaybeT(..)) +import Crypto.Hash.SHA1 (hash) +import qualified Data.ByteString.Base16 as B16 +import qualified Data.ByteString.Char8 as B +import Data.Foldable (Foldable(..)) import qualified Data.Map as M import Data.IORef import Data.List (isPrefixOf) import Data.Maybe -import System.Directory (doesFileExist) -import System.FilePath (pathSeparator, (<.>), ()) +import System.Directory +import System.FilePath (pathSeparator, takeDirectory, (<.>), ()) + +import Lang.JVM.JavaTools +import Lang.JVM.ProcessUtils import Language.JVM.Common import Language.JVM.JarReader import Language.JVM.Parser +{- Note [Loading classes from JIMAGE files] + +The Java standard library is packaged differently depending on which version of +the JDK is being used: + +* JDK 8 and earlier ship the standard library in /lib/rt.jar, so + loading the standard library's .class files is simply a matter of parsing + rt.jar. The `jvm-parser` library is used to parse rt.jar like any other JAR + file. +* JDK 9 and later abandon rt.jar and instead distribute the standard library as + a JIMAGE file. (For more on the JIMAGE file format and how it compares to + JAR, see https://stackoverflow.com/a/64202720.) In particular, the standard + library is now located in a JIMAGE file at /lib/modules. + +We need access to the standard library's .class files in order to verify even +the simplest Java programs, so figuring out how to load classes from JIMAGE +files is a requirement for using JDK 9 or later with crucible-jvm. Unlike JAR, +however, the JIMAGE file format is internal to the JDK and subject to change in +the future. As a result, parsing JIMAGE files directly, like `jvm-parser` does +with JAR files, is likely not a good idea. + +If parsing JIMAGE files directly isn't a viable option, what can be done? As +far as I can tell, there are only two reasonable ways of extracting .class +files from JIMAGE files (outside of using Java itself): + +* Use libjimage, a C++ library bundled with the JDK, to load classes. This is + what the JDK itself uses internally, and as a result, it's reasonably space- + and time-efficient to load a single class from a JIMAGE file. Unfortunately, + this is a non-starter for crucible-jvm due to potential linking issues that + would arise when combining GHC with C++ code. + See https://github.com/GaloisInc/crucible/issues/621. + +* Use the jimage utility, shipped alongside java in /bin, to extract + the contents of JIMAGE files. The `jimage extract` command can either be used + to extract everything in a JIMAGE file all at once, or it can be used to + extract one class at a time, like in this example: + + jimage extract --include regex:.*/java/lang/Class.class --dir + +Since libjimage is an option, that only leaves jimage. Unfortunately, jimage is +not without its flaws: + +* jimage is /slow/. It takes about about 1.5 seconds for jimage to extract the + standard `modules` file on a reasonably performant machine. Having an extra + 1.5 seconds of lag every time that crucible-jvm is invoked doesn't sound + appealing. + +* What's worse, jimage is still unreasonably slow even if you only use it to + extract a single class from a JIMAGE file. It takes about 0.25 seconds to + extract java/lang/Class.class from the standard `modules`, which leads me to + suspect that jimage is doing a linear scan through every file in `modules`. + In any case, this is about 14× slower than loading a class file from a JAR, + and if crucible-jvm spends 0.25 seconds every time it loads any class from + `modules`, this would add up very quickly. + +What can we do about jimage's performance issues? One appealing alternative is +to use jlink, yet another utility that the JDK ships. If you give jlink a list +of Java modules, it will produce a minimal runtime environment that includes a +JIMAGE file that only contains the classes needed to support those modules, but +nothing else. In theory, this could cut down on the time it takes to extract +classes from a JIMAGE file, as the standard library's `modules` file is ~136 MB +when compressed (~493 MB when uncompressed), and most Java classes will only +use a small portion of that. + +Unfortunately, jlink is even slower than jimage. It takes jlink about 3 seconds +to create a runtime environment that only contains the `java.base` module, +which cancels out any time saved when extracting the JIMAGE file afterwards. + +Having exhausted all other possibilities, we have concluded that we simply +can't make JIMAGE extraction fast in the general case. We /can/, however, +amortize the cost of using JIMAGE files by caching the extracted contents. +This is exactly what crucible-jvm does. Any time that crucible-jvm tries to +load a class (e.g., java/lang/Class.class), it first checks to see if it is in +a known cache directory +(e.g., ~/.cache/crucible-jvm//java.base/java/lang/Class.class), +and if so, loads it like it would any other .class on the classpath. If not, +it will then consult the JAR path and the classpath. If it is not located by +that point, then crucible-jvm will attempt to extract it from the JIMAGE file, +caching the result for subsequent lookups. Importantly, we perform the JIMAGE +extraction step last, as it is far more time-consuming that trying to load +classes from the JAR path or classpath. + +There's still possibly some room for further optimization here. As mentioned +above, extracting one class at a time from a JIMAGE file takes about 0.25 +seconds, while extracting the entire JIMAGE file takes about 1.5 seconds. +Depending on how many classes a program uses, it may take less time overall +to just extract-and-cache everything from a JIMAGE file at once. The downside, +however, is that the cache would require much more space. A typical JDK +`modules` file takes up ~493 MB of classes when extracted, but most simple Java +programs will only use <1 MB of these classes. As a result, we opt for the +one-class-at-a-time approach, which seems better tuned for the sorts of +Java programs that crucible-jvm verifies in common cases. +-} + -- | Collection of classes loaded by JVM. data CodebaseState = CodebaseState { classMap :: M.Map ClassName Class @@ -46,10 +149,23 @@ data CodebaseState = CodebaseState { -- interfaces to list of classes that directly implement them. } +-- | Contains the path to the @jimage@ tool, the standard @modules@ JIMAGE +-- file, and the path where the extracted contents of @modules@ are cached. +-- +-- This is only used when crucible-jvm uses JDK 9 or later. +-- See Note [Loading classes from JIMAGE files]. +data JimagePaths = JimagePaths + { jimagePath :: FilePath + , standardModulesFilePath :: FilePath + , standardModulesCachePath :: FilePath + } + data Codebase = Codebase { jarReader :: JarReader -- ^ Maps class names to lazily loaded classes in JARs , classPaths :: [FilePath] + , jimagePaths :: Maybe JimagePaths + -- ^ The path to the @jimage@ tool and the paths it works on. , stateRef :: IORef CodebaseState } @@ -58,8 +174,8 @@ instance Show Codebase where -- | Loads Java classes directly beneath given path. Also loads jar indices for -- lazy class loading. -loadCodebase :: [FilePath] -> [FilePath] -> IO Codebase -loadCodebase jarFiles classPaths = do +loadCodebase :: [FilePath] -> [FilePath] -> [FilePath] -> IO Codebase +loadCodebase jarFiles classPaths javaBinDirs = do -- REVISIT: Currently, classes found in the classpath shadow those in the -- jars. Pretty sure the -classpath argument to the vanilla jvm allows -- mixture of jars and directories, and resolves names in the order in which @@ -87,9 +203,16 @@ loadCodebase jarFiles classPaths = do -- lookup in many maps -- one for each classpath component -- we can -- merge the maps as in the current 'JarReader' type, but I doubt -- this would ever matter, performance wise. - jars <- newJarReader jarFiles - let cb = CodebaseState M.empty M.empty - Codebase jars classPaths <$> newIORef cb + jars <- newJarReader jarFiles + mbJimagePaths <- findJimagePaths javaBinDirs + let cb = CodebaseState { classMap = M.empty, subclassMap = M.empty } + cbRef <- newIORef cb + pure $ Codebase + { jarReader = jars + , classPaths = classPaths + , jimagePaths = mbJimagePaths + , stateRef = cbRef + } -- | Register a class with the given codebase addClass :: Class -> CodebaseState -> CodebaseState @@ -107,32 +230,61 @@ addClass cl (CodebaseState cMap scMap) = -- | Returns class with given name in codebase or returns nothing if no class with -- that name can be found. tryLookupClass :: Codebase -> ClassName -> IO (Maybe Class) -tryLookupClass (Codebase{jarReader, classPaths, stateRef}) clNm = do +tryLookupClass (Codebase{jarReader, classPaths, jimagePaths, stateRef}) clNm = do cb <- readIORef stateRef case M.lookup clNm (classMap cb) of Just cl -> return (Just cl) Nothing -> do -- Here we bias our search to JARs before classpath directories, -- as mentioned above in 'loadCodebase'. - let mcls = [loadClassFromJar clNm jarReader] ++ - map (loadClassFromDir clNm) classPaths + let mcls = [ loadClassFromStandardModulesCache clNm jimagePaths + , loadClassFromJar clNm jarReader ] ++ + map (loadClassFromDir clNm) classPaths ++ + [ loadClassFromStandardModulesFile clNm jimagePaths ] mcl <- foldl1 firstSuccess mcls case mcl of Just cl -> do writeIORef stateRef $! addClass cl cb return $ Just cl Nothing -> return Nothing + +-- Search for a .class file in the cache of files extracted from the `modules` +-- JIMAGE file. If it is not found, this does /not/ extract it from a JIMAGE +-- file—see loadClassFromStandardModulesFile. +loadClassFromStandardModulesCache :: ClassName -> Maybe JimagePaths -> IO (Maybe Class) +loadClassFromStandardModulesCache clNm mbJimagePaths = + case mbJimagePaths of + Nothing -> pure Nothing + Just JimagePaths{standardModulesCachePath} -> + loadClassFromParentDir clNm standardModulesCachePath + +-- Attempt to extract a .class file from the `modules` JIMAGE file, cache it +-- for subsequent lookups, and load the resulting .class. +loadClassFromStandardModulesFile :: ClassName -> Maybe JimagePaths -> IO (Maybe Class) +loadClassFromStandardModulesFile clNm mbJimagePaths = + case mbJimagePaths of + Nothing -> pure Nothing + Just JimagePaths{jimagePath, standardModulesFilePath, standardModulesCachePath} -> do + _ <- readProcessExitIfFailure jimagePath + [ "extract" + , "--include", "regex:.*/" ++ escapeDollars (unClassName clNm) ++ ".class" + , "--dir", standardModulesCachePath + , standardModulesFilePath + ] + -- After extracting a single class from a JIMAGE file, why do we then + -- have to search every directory under the cached path again? It's + -- because `jimage extract` doesn't tell you the path of the thing it + -- just extracted. Sigh. We can't reasonably infer what this path is + -- either, as it will start with a module name, which isn't something + -- that is contained in a ClassName. + loadClassFromParentDir clNm standardModulesCachePath where - -- | Combine two @IO (Maybe a)@ computations lazily, choosing the - -- first to succeed (i.e. return 'Just'). - firstSuccess :: IO (Maybe a) -> IO (Maybe a) -> IO (Maybe a) - -- This seems like it would be a common pattern, although I can't - -- find it in the standard libraries. - firstSuccess ima1 ima2 = do - ma1 <- ima1 - case ma1 of - Nothing -> ima2 - Just _ -> return ma1 + -- The '$' character needs to be escaped in the regex that + -- `jimage extract` uses. + escapeDollars :: String -> String + escapeDollars [] = [] + escapeDollars ('$':cs) = '\\':'$':escapeDollars cs + escapeDollars (c:cs) = c:escapeDollars cs -- | Attempt to load a class by searching under directory @dir@, which -- is assumed to be a classpath component. If class @C1. ... .Cn@ is @@ -156,6 +308,38 @@ loadClassFromDir clNm dir = do classNameToClassFilePath clNm = map (\c -> if c == '/' then pathSeparator else c) (unClassName clNm) <.> "class" +-- Given @parentDir@, search through its subdirectories to find a class, +-- loading it if it is found. +loadClassFromParentDir :: ClassName -> FilePath -> IO (Maybe Class) +loadClassFromParentDir clNm parentDir = do + childDirBaseNames <- listDirectory parentDir + let childDirs = map (parentDir ) childDirBaseNames + foldl' firstSuccess (pure Nothing) $ map (loadClassFromDir clNm) childDirs + +-- | Attempt to find an executable named @jimage@, either in the directories +-- supplied as arguments or on the @PATH@. If such an executable can be found, +-- then @Just@ a @JimagePaths@ is returned. Otherwise, @Nothing@ is returned. +-- +-- This is only used when crucible-jvm uses JDK 9 or later. +-- See Note [Loading classes from JIMAGE files]. +findJimagePaths :: [FilePath] -> IO (Maybe JimagePaths) +findJimagePaths javaBinDirs = runMaybeT $ do + javaPath <- MaybeT $ findJavaIn javaBinDirs + javaPathCanonical <- liftIO $ canonicalizePath javaPath -- Remove any symlinks + jimagePath <- MaybeT $ findJimageIn [takeDirectory javaPathCanonical] + javaHome <- MaybeT $ findJavaProperty javaPathCanonical "java.home" + MaybeT $ do + let standardModulesPath = javaHome "lib" "modules" + xdgCacheDir <- getXdgDirectory XdgCache "crucible-jvm" + let standardModulesHash = B16.encode $ hash $ B.pack standardModulesPath + standardModulesCache = xdgCacheDir B.unpack standardModulesHash + createDirectoryIfMissing True standardModulesCache + pure $ Just $ JimagePaths + { jimagePath = jimagePath + , standardModulesFilePath = standardModulesPath + , standardModulesCachePath = standardModulesCache + } + -- | Returns class with given name in codebase or raises error if no class with -- that name can be found. -- @@ -308,3 +492,14 @@ starClosure fn a = a : concatMap (starClosure fn) (fn a) starClosureM :: Monad m => (a -> m [a]) -> a -> m [a] starClosureM fn a = return ((a:) . concat) `ap` (mapM (starClosureM fn) =<< fn a) + +-- | Combine two @IO (Maybe a)@ computations lazily, choosing the +-- first to succeed (i.e. return 'Just'). +firstSuccess :: IO (Maybe a) -> IO (Maybe a) -> IO (Maybe a) +-- This seems like it would be a common pattern, although I can't +-- find it in the standard libraries. +firstSuccess ima1 ima2 = do + ma1 <- ima1 + case ma1 of + Nothing -> ima2 + Just _ -> return ma1 diff --git a/crucible-jvm/src/Lang/JVM/JavaTools.hs b/crucible-jvm/src/Lang/JVM/JavaTools.hs index 760ce0e63..56b51f842 100644 --- a/crucible-jvm/src/Lang/JVM/JavaTools.hs +++ b/crucible-jvm/src/Lang/JVM/JavaTools.hs @@ -13,12 +13,11 @@ module Lang.JVM.JavaTools , findJavaMajorVersion ) where -import Control.Monad (when) import Data.List.Extra (dropPrefix, firstJust, stripInfix) import Data.Maybe import System.Directory -import System.Exit (ExitCode(..)) -import System.Process (readProcessWithExitCode) + +import Lang.JVM.ProcessUtils -- | @'findJavaIn' searchPaths@ searches for an executable named @java@ among -- the directories in @searchPaths@. If @searchPaths@ is empty, then the @PATH@ @@ -92,19 +91,3 @@ findJavaMajorVersion javaPath = do -- leading 1. bit. dropOldJavaVersionPrefix :: String -> String dropOldJavaVersionPrefix = dropPrefix "1." - --- | Invokes @readProcessWithExitCode@ with no standard input, and returns the --- resulting standard output and standard error. If the exit code of the --- process is not 'ExitSuccess', throw an exception with some information that --- may be helpful to debug what went wrong. -readProcessExitIfFailure :: FilePath -> [String] -> IO (String, String) -readProcessExitIfFailure cmd args = do - (ec, out, err) <- readProcessWithExitCode cmd args "" - when (ec /= ExitSuccess) $ - fail $ unlines [ cmd ++ " returned non-zero exit code: " ++ show ec - , "Standard output:" - , out - , "Standard error:" - , err - ] - pure (out, err) diff --git a/crucible-jvm/src/Lang/JVM/ProcessUtils.hs b/crucible-jvm/src/Lang/JVM/ProcessUtils.hs new file mode 100644 index 000000000..1924bdc5f --- /dev/null +++ b/crucible-jvm/src/Lang/JVM/ProcessUtils.hs @@ -0,0 +1,30 @@ +{- | +Module : Lang.JVM.ProcessUtils +Description : Miscellaneous @process@-related utilities. +Copyright : (c) Galois, Inc 2021 +License : BSD3 +Maintainer : rscott@galois.com +Stability : provisional +-} + +module Lang.JVM.ProcessUtils (readProcessExitIfFailure) where + +import Control.Monad (when) +import System.Exit (ExitCode(..)) +import System.Process (readProcessWithExitCode) + +-- | Invokes @readProcessWithExitCode@ with no standard input, and returns the +-- resulting standard output and standard error. If the exit code of the +-- process is not 'ExitSuccess', throw an exception with some information that +-- may be helpful to debug what went wrong. +readProcessExitIfFailure :: FilePath -> [String] -> IO (String, String) +readProcessExitIfFailure cmd args = do + (ec, out, err) <- readProcessWithExitCode cmd args "" + when (ec /= ExitSuccess) $ + fail $ unlines [ cmd ++ " returned non-zero exit code: " ++ show ec + , "Standard output:" + , out + , "Standard error:" + , err + ] + pure (out, err) diff --git a/crucible-jvm/tool/Main.hs b/crucible-jvm/tool/Main.hs index 5100e2d32..a0247b4d3 100644 --- a/crucible-jvm/tool/Main.hs +++ b/crucible-jvm/tool/Main.hs @@ -178,7 +178,7 @@ simulateJVM copts opts = Crux.SimulatorCallback $ \sym _maybeOnline -> do [file] -> return file _ -> fail "crux-jvm requires a single file name as an argument" - cb <- JCB.loadCodebase (jarList opts) (classPath opts) + cb <- JCB.loadCodebase (jarList opts) (classPath opts) (javaBinDirs opts) let cname = takeBaseName file let mname = mainMethod opts