Skip to content

Improve the tail call identification logic to prefer local jumps unless the target is a known function entry #296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions base/src/Data/Macaw/Discovery.hs
Original file line number Diff line number Diff line change
Expand Up @@ -497,24 +497,22 @@ useExternalTargets bcc = do
-- new classifiers (e.g., classifiers that can produce architecture-specific
-- terminators).
--
-- Note that the direct jump classifier is last; this is to give the other
-- classifiers a chance to run and match code. In particular, tail calls and
-- direct local jumps are very difficult to distinguish from each other. Since
-- we have to choose some order between them, we put the tail call classifier
-- first so that it at least *could* fire without being completely subsumed by
-- direct jumps. This means that some control flow transfers that look like
-- direct jumps could instead be classified as tail calls. It would take some
-- higher-level heuristics (e.g., live registers at the call site, locality) to
-- distinguish the cases.
-- Note that classifiers are an instance of 'Control.Monad.Alternative', so the
-- order they are applied in matters. While several are non-overlapping, at
-- least the order that the direct jump and tail call classifiers are applied in
-- matters, as they look substantially the same to the analysis. Being too eager
-- to flag jumps as tail calls classifies the jump targets as known function
-- entry points, which can interfere with other classifiers later in the
-- function.
defaultClassifier :: BlockClassifier arch ids
defaultClassifier = branchClassifier
<|> noreturnCallClassifier
<|> callClassifier
<|> returnClassifier
<|> jumpTableClassifier
<|> pltStubClassifier
<|> tailCallClassifier
<|> directJumpClassifier
<|> tailCallClassifier

-- | This parses a block that ended with a fetch and execute instruction.
parseFetchAndExecute :: forall arch ids
Expand Down
22 changes: 20 additions & 2 deletions base/src/Data/Macaw/Discovery/Classifier.hs
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,21 @@ returnClassifier = classifierName "Return" $ do
}

-- | Classifies jumps to concrete addresses as unconditional jumps. Note that
-- this logic is substantially similar to the 'callClassifier'; as such, this
-- classifier should always be applied *after* the 'callClassifier'.
-- this logic is substantially similar to the 'tailCallClassifier' in cases
-- where the function does not establish a stack frame (i.e., leaf functions).
--
-- Note that known call targets are not eligible to be intra-procedural jump
-- targets (see 'classifyDirectJump'). This means that we need to conservatively
-- prefer to mis-classify terminators as jumps rather than tail calls. The
-- downside of this choice is that code that could be considered a tail-called
-- function may be duplicated in some cases (i.e., considered part of multiple
-- functions).
--
-- The alternative interpretation (eagerly preferring tail calls) can cause a
-- section of a function to be marked as a tail-called function, thereby
-- blocking the 'directJumpClassifier' or the 'branchClassifier' from
-- recognizing the "callee" as an intra-procedural jump. This results in
-- classification failures that we don't have any mitigations for.
directJumpClassifier :: BlockClassifier arch ids
directJumpClassifier = classifierName "Jump" $ do
bcc <- CMR.ask
Expand Down Expand Up @@ -380,6 +393,11 @@ noreturnCallClassifier = classifierName "no return call" $ do
--
-- The current heuristic is that the target looks like a call, except the stack
-- height in the caller is 0.
--
-- Note that, in leaf functions (i.e., with no stack usage), tail calls and
-- jumps look substantially similar. We typically apply the jump classifier
-- first to prefer them, which means that we very rarely recognize tail calls in
-- leaf functions.
tailCallClassifier :: BlockClassifier arch ids
tailCallClassifier = classifierName "Tail call" $ do
bcc <- CMR.ask
Expand Down
3 changes: 1 addition & 2 deletions macaw-aarch32/tests/arm/syscall-a32.mcw.expected
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
R { funcs = [ (0x10054, [ (0x10054, 32), (0x10074, 12), (0x10080, 4) ])
, (0x10084, [ (0x10084, 24), (0x1009c, 24) ])
R { funcs = [ (0x10054, [ (0x10054, 32), (0x10074, 12), (0x10080, 4), (0x10084, 24), (0x1009c, 24) ])
]
, ignoreBlocks = []
}
1 change: 1 addition & 0 deletions x86/macaw-x86.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ test-suite macaw-x86-tests
lens,
macaw-base,
parameterized-utils,
prettyprinter,
macaw-x86,
temporary,
tasty,
Expand Down
46 changes: 40 additions & 6 deletions x86/tests/ElfX64Linux.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,47 @@
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE DataKinds #-}
module ElfX64Linux (
elfX64LinuxTests
SaveMacaw(..)
, elfX64LinuxTests
) where

import Control.Lens ( (^.) )
import Control.Monad ( unless, when )
import qualified Control.Monad.Catch as C
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BSC
import qualified Data.Foldable as F
import qualified Data.Map as M
import Data.Maybe
import qualified Data.Set as S
import Data.Typeable ( Typeable )
import Data.Word ( Word64 )
import Numeric (showHex)
import qualified Prettyprinter as PP
import System.FilePath
import qualified Test.Tasty as T
import qualified Test.Tasty.HUnit as T
import qualified Test.Tasty.Options as TTO
import Text.Printf ( printf )
import Text.Read ( readMaybe )

import qualified Data.ElfEdit as E

import qualified Data.Parameterized.Some as PU
import qualified Data.Macaw.CFG as MC
import qualified Data.Macaw.Discovery as MD
import qualified Data.Macaw.Memory as MM
import qualified Data.Macaw.Memory.ElfLoader as MM
import qualified Data.Macaw.Discovery as MD
import qualified Data.Macaw.X86 as RO
import qualified Data.Parameterized.Some as PU

data SaveMacaw = SaveMacaw Bool

instance TTO.IsOption SaveMacaw where
defaultValue = SaveMacaw False
parseValue v = SaveMacaw <$> TTO.safeReadBool v
optionName = pure "save-macaw"
optionHelp = pure "Save Macaw IR for each test case to /tmp for debugging"


-- |
elfX64LinuxTests :: [FilePath] -> T.TestTree
Expand Down Expand Up @@ -60,7 +74,7 @@ data ExpectedResult =

-- | Given a path to an expected file, this
mkTest :: FilePath -> T.TestTree
mkTest fp = T.testCase fp $ withELF elfFilename (testDiscovery fp)
mkTest fp = T.askOption $ \saveMacaw@(SaveMacaw _) -> T.testCase fp $ withELF elfFilename (testDiscovery saveMacaw fp)
where
elfFilename = dropExtension fp

Expand All @@ -81,10 +95,24 @@ toAddr segOff = do
addr = MM.segoffAddr segOff
in Addr (fromIntegral (MM.addrBase addr)) (fromIntegral (MM.addrOffset addr))

escapeSlash :: Char -> Char
escapeSlash '/' = '_'
escapeSlash c = c

toSavedMacawPath :: String -> FilePath
toSavedMacawPath testName = "/tmp" </> name <.> "macaw"
where
name = fmap escapeSlash testName

writeMacawIR :: (MC.ArchConstraints arch) => SaveMacaw -> String -> MD.DiscoveryFunInfo arch ids -> IO ()
writeMacawIR (SaveMacaw sm) name dfi
| not sm = return ()
| otherwise = writeFile (toSavedMacawPath name) (show (PP.pretty dfi))

-- | Run a test over a given expected result filename and the ELF file
-- associated with it
testDiscovery :: FilePath -> E.ElfHeaderInfo 64 -> IO ()
testDiscovery expectedFilename elf = do
testDiscovery :: SaveMacaw -> FilePath -> E.ElfHeaderInfo 64 -> IO ()
testDiscovery saveMacaw expectedFilename elf = do
let opt = MM.defaultLoadOptions
(warn, mem, mentry, syms) <-
case MM.resolveElfContents opt elf of
Expand All @@ -98,6 +126,12 @@ testDiscovery expectedFilename elf = do
| sym <- syms
]
let di = MD.cfgFromAddrs RO.x86_64_linux_info mem addrSymMap entries []

let testName = takeFileName expectedFilename
F.forM_ (di ^. MD.funInfo) $ \(PU.Some dfi) -> do
let funcFileName = testName <.> BSC.unpack (MD.discoveredFunName dfi)
writeMacawIR saveMacaw funcFileName dfi

expectedString <- readFile expectedFilename
case readMaybe expectedString of
Nothing -> T.assertFailure ("Invalid expected result: " ++ show expectedString)
Expand Down
13 changes: 11 additions & 2 deletions x86/tests/Main.hs
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
{-# LANGUAGE TypeApplications #-}
module Main ( main ) where

import System.FilePath.Glob ( namesMatching )
import Data.Proxy ( Proxy(..) )
import System.FilePath.Glob ( namesMatching )
import qualified Test.Tasty as T
import qualified Test.Tasty.Options as TTO
import qualified Test.Tasty.Runners as TTR

import qualified ElfX64Linux as T


ingredients :: [TTR.Ingredient]
ingredients = T.includingOptions [ TTO.Option (Proxy @T.SaveMacaw)
] : T.defaultIngredients

main :: IO ()
main = do
x64AsmTests <- namesMatching "tests/x64/*.expected"
T.defaultMain $ T.testGroup "ReoptTests" [
T.defaultMainWithIngredients ingredients $ T.testGroup "ReoptTests" [
T.elfX64LinuxTests x64AsmTests
]
Binary file added x86/tests/x64/inner_loop_31.exe
Binary file not shown.
3 changes: 3 additions & 0 deletions x86/tests/x64/inner_loop_31.exe.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
R { funcs = [(Addr 0 0x401000, [(Addr 0 0x401000, 34), (Addr 0 0x401022, 76), (Addr 0 0x40106e, 31)])]
, ignoreBlocks = []
}
38 changes: 38 additions & 0 deletions x86/tests/x64/inner_loop_31.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
main:
endbr64
movabs $0x7fffffff80000000,%rcx
movabs $0x800000007fffffff,%r13
movabs $0x7fffffff7fffffff,%r15
target:
cmp %r10,%r8
mov %r8,%rax
mov %r10,%rbx
mov %rcx,%rbp
mov %r13,%r14
cmovb %r10,%r8
cmovb %rax,%r10
cmovb %r13,%rcx
cmovb %rbp,%r13
sub %r10,%r8
sub %r13,%rcx
add %r15,%rcx
test $0x1,%rax
cmove %rax,%r8
cmove %rbx,%r10
cmove %rbp,%rcx
cmove %r14,%r13
shr %r8
add %r13,%r13
sub %r15,%r13
sub $0x1,%edi
jne target
shr $0x20,%r15
mov %ecx,%edx
mov %r13d,%r12d
shr $0x20,%rcx
shr $0x20,%r13
sub %r15,%rdx
sub %r15,%rcx
sub %r15,%r12
sub %r15,%r13
repz retq
3 changes: 1 addition & 2 deletions x86/tests/x64/test-bitmask-rsp.exe.expected
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
R { funcs = [ (Addr 0 0x401000, [(Addr 0 0x401000,33)])
, (Addr 0 0x401021, [(Addr 0 0x401021, 2)])
R { funcs = [ (Addr 0 0x401000, [(Addr 0 0x401000,33), (Addr 0 0x401021, 2)])
]
, ignoreBlocks = []
}
1 change: 0 additions & 1 deletion x86/tests/x64/test-plt.exe.expected
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ R { funcs =
, (Addr 1 0x610, [(Addr 1 0x610,27),(Addr 1 0x62b,12),(Addr 1 0x637,3),(Addr 1 0x640,2)])
, (Addr 1 0x650, [(Addr 1 0x650,40),(Addr 1 0x678,12),(Addr 1 0x684,3),(Addr 1 0x690,2)])
, (Addr 1 0x6a0, [(Addr 1 0x6a0,9),(Addr 1 0x6a9,14),(Addr 1 0x6b7,12),(Addr 1 0x6c3,5),(Addr 1 0x6c8,8), (Addr 1 0x6d0,2)])
, (Addr 1 0x6d0, [(Addr 1 0x6d0,2)])
, (Addr 1 0x6e0, [(Addr 1 0x6e0,13), (Addr 1 0x6ed,5), (Addr 1 0x6f8, 12), (Addr 1 0x704, 6), (Addr 1 0x70a, 6)])
, (Addr 1 0x710, [(Addr 1 0x710,13),(Addr 1 0x71d,4)])
, (Addr 1 0x730, [(Addr 1 0x730,49),(Addr 1 0x761,5),(Addr 1 0x766,10),(Addr 1 0x770,13),(Addr 1
Expand Down
3 changes: 1 addition & 2 deletions x86/tests/x64/test-tail-call.exe.expected
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
R { funcs = [ (Addr 1 0x2c0, [(Addr 1 0x2c0, 7)])
, (Addr 1 0x2d0, [(Addr 1 0x2d0, 11)])
, (Addr 1 0x2e0, [(Addr 1 0x2e0, 11), (Addr 1 0x2eb, 16)])
, (Addr 1 0x2fb, [(Addr 1 0x2fb, 1)])
, (Addr 1 0x2e0, [(Addr 1 0x2e0, 11), (Addr 1 0x2eb, 16), (Addr 1 0x2fb, 1)])
]
, ignoreBlocks = []
}