Factual · myronahn · Nov 1, 2013 · Nov 8, 2013 · Nov 14, 2013 · Nov 14, 2013
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 * bugfix: [#98](https://github.com/Factual/drake/issues/98) --help now doesn't run workflow (thanks marshallshen)
 * Upgrade to c4 0.2.0, which no longer bundles the Facebook API
+* New feature: temp targets (https://github.com/Factual/drake/issues/88)
 
 ## 0.1.4
 

diff --git a/resources/regtest/regtest_temp.d b/resources/regtest/regtest_temp.d
@@ -0,0 +1,29 @@
+; Regression testing for Drake: checkpointing/temp targets
+; Relevant URLs:
+; https://github.com/Factual/drake/issues/88
+
+~temp1 <- start
+  echo -n $INPUTS > $OUTPUT
+  echo -n "t1" >> stepsrun
+
+~temp2 <- temp1
+  echo -n $INPUTS > $OUTPUT
+  echo -n "t2" >> stepsrun
+
+~temp3, perm1 <- temp2
+  echo -n INPUTS > $OUTPUT0
+  echo -n $INPUTS > $OUTPUT1
+  echo -n "t3p1" >> stepsrun
+
+perm2 <- perm1
+  echo -n $INPUTS > $OUTPUT
+  echo -n "p2" >> stepsrun
+
+perm3 <- temp2
+  echo -n $INPUTS > $OUTPUT
+  echo -n "p3" >> stepsrun
+
+perm4 <- temp3
+  echo -n $INPUTS > $OUTPUT
+  echo -n "p4" >> stepsrun
+
diff --git a/resources/regtest/regtest_temp.sh b/resources/regtest/regtest_temp.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Regression testing for Drake: checkpointing/temp targets
+# Relevant URLs:
+# https://github.com/Factual/drake/issues/88
+
+source $(dirname $0)/regtest_utils.sh
+
+export FILES=start\ stepsrun\ temp1\ temp2\ temp3\ perm1\ perm2\ perm3\ perm4
+
+cleanup() {
+  for FILE in $FILES
+  do
+    rm -f $(dirname $0)/$FILE>/dev/null 2>&1
+  done
+}
+
+checkfiles() {
+check_does_not_exist temp1 temp2 temp3
+check_exists perm1 perm2 perm3 perm4
+}
+
+echo "------------------"
+echo "TESTS: checkpoints"
+echo "------------------"
+
+# First cleanup any existing files
+cleanup
+
+# Run script from scratch, make sure temp files are deleted
+touch start
+run_d regtest_temp.d -a
+check_grep stepsrun "t1t2t3p1p2p3p4"
+checkfiles
+
+# Run script again, make sure nothing happens even though temp files are deleted
+rm stepsrun
+run_d regtest_temp.d -a
+check_does_not_exist stepsrun
+checkfiles
+
+# Touch start, run script again, make sure all files are built
+sleep 2 # get around the 1 second accuracy of lastModified() in Java
+#rm stepsrun
+touch start
+run_d regtest_temp.d -a
+check_grep stepsrun "t1t2t3p1p2p3p4"
+checkfiles
+
+# Remove perm2, make sure only perm2 step is run
+sleep 2 # get around the 1 second accuracy of lastModified() in Java
+rm stepsrun
+rm perm2
+run_d regtest_temp.d -a
+check_grep stepsrun "p2"
+checkfiles
+
+# Touch temp2, make sure temp3, perm1, perm2, perm3, perm4 are built
+sleep 2 # get around the 1 second accuracy of lastModified() in Java
+rm stepsrun
+touch temp2
+run_d regtest_temp.d -a
+check_grep stepsrun "t3p1p2p3p4"
+checkfiles
+
+# Touch temp3, make sure perm4 is run
+sleep 2 # get around the 1 second accuracy of lastModified() in Java
+rm stepsrun
+touch temp3
+run_d regtest_temp.d -a
+check_grep stepsrun "p4"
+checkfiles
+
+# Touch start, build only perm3, make sure temp1, temp2, perm3 are run
+sleep 2 # get around the 1 second accuracy of lastModified() in Java
+rm stepsrun
+touch start
+run_d regtest_temp.d -a perm3
+check_grep stepsrun "t1t2p3"
+checkfiles
+
+# Cleanup, build only temp3, make sure temp1, temp2, temp3, perm1 are run
+# Also make sure temp3 still exists
+cleanup
+touch start
+run_d regtest_temp.d -a temp3 
+check_grep stepsrun "t1t2t3p1"
+check_does_not_exist temp1 temp2
+check_exists perm1 temp3
+
+# Final cleanup 
+cleanup
+
+echo "ALL PASSED"
+
diff --git a/resources/regtest/regtest_utils.sh b/resources/regtest/regtest_utils.sh
@@ -1,6 +1,32 @@
 STDOUT=$(dirname $0)/stdout.log
 STDERR=$(dirname $0)/stderr.log
 
+check_exists() {
+  for file in "$@"
+  do
+    echo "Making sure $file exists"
+    if [ ! -f $file ]; then
+      echo "FAIL"
+      echo "file $file does not exist"
+      exit -1
+    fi
+  done
+  echo "PASS"
+}
+
+check_does_not_exist() {
+  for file in "$@"
+  do
+    echo "Making sure $file does not exist"
+    if [ -f $file ]; then
+      echo "FAIL"
+      echo "file $file exists"
+      exit -1
+    fi
+  done
+  echo "PASS"
+}
+
 check() {
   if [ "$1" != "$2" ]; then
     echo "FAIL"

diff --git a/resources/regtest/run-all.sh b/resources/regtest/run-all.sh
@@ -8,7 +8,8 @@ if ($(dirname $0)/regtest_fs.sh &&
     $(dirname $0)/regtest_inputs_outputs.sh &&
     $(dirname $0)/regtest_methods.sh &&
     $(dirname $0)/regtest_protocol_eval.sh &&
-    $(dirname $0)/regtest_s3.sh); then
+    $(dirname $0)/regtest_s3.sh &&
+    $(dirname $0)/regtest_temp.sh); then
   echo "run-all: ALL TESTS PASSED"
 else
   echo "run-all: SOME TESTS FAILED"

diff --git a/src/drake/core.clj b/src/drake/core.clj
@@ -158,6 +158,47 @@
       :vars vars
       :opts (if-not method opts (merge (method :opts) opts)))))
 
+(defn- data-in?
+  "Shortcut for (fs di/data-in?)"
+  [file]
+  (fs di/data-in? file))
+
+(defn- no-data-in?
+  "Shortcut for (not (fs di/data-in?))"
+  [file]
+  (not (data-in? file)))
+
+(defn- expand-outputs
+  "Given a set of outputs, expand any temp outputs that have no data by
+  recursively replacing the temp output with the outputs of any steps 
+  that depend on that temp output - and then doing the same for the
+  temp outputs of the dependent steps that have no data.  Temp outputs
+  that have data will not be expanded.  This is the key algorithm that
+  allows Drake to deal with deleted temp targets without triggering 
+  unnecessary rebuilds."
+  [parse-tree step-list step]
+  (let [step-set (into #{} step-list)
+        steps (parse-tree :steps)
+        real-outputs (step :real-outputs)
+        temp-outputs (step :temp-outputs)
+        temp-input-map-lookup (parse-tree :temp-input-map-lookup)
+        [existing-temp-outputs empty-temp-outputs] (split-with data-in? temp-outputs)
+        empty-temp-output-step-deps (->> empty-temp-outputs
+                                      (map (comp temp-input-map-lookup normalized-path)) 
+                                      flatten
+                                      (filter step-set)) 
+        expanded-temp-outputs (->> empty-temp-output-step-deps
+                                (map (comp (partial expand-outputs parse-tree step-list) steps))
+                                flatten)]
+    (trace "expand-outputs step:" step)
+    (trace "expand-outputs step-list:" step-list)
+    (trace "expand-outputs existing-temp-outputs:" existing-temp-outputs)
+    (trace "expand-outputs empty-temp-outputs:" empty-temp-outputs)
+    (trace "expand-outputs empty-temp-output-step-deps:" empty-temp-output-step-deps)
+    (trace "expand-outputs expanded-temp-outputs" expanded-temp-outputs)
+
+    (concat real-outputs existing-temp-outputs expanded-temp-outputs)))
+
 (defn- should-build?
   "Given the parse tree and a step index, determines whether it should
    be built and returns the reason (e.g. 'timestamped') or
@@ -174,15 +215,21 @@
                     above). If this step is specified manually (root of a
                     dependency subtree), will always fail on an empty input,
                     since it will not be generated by any other step."
-  [step forced triggered match-type fail-on-empty]
+  [parse-tree step-list step forced triggered match-type fail-on-empty]
   (trace "should-build? fail-on-empty: " fail-on-empty)
-  (let [{:keys [inputs outputs opts]} (branch-adjust-step step false)
-        empty-inputs (filter #(not (fs di/data-in? %)) inputs)
+  (let [{:keys [real-inputs real-outputs temp-inputs temp-outputs opts]} (branch-adjust-step step false)
+        inputs (into real-inputs (filter data-in? temp-inputs))
+        empty-inputs (filter no-data-in? real-inputs)
+        outputs (expand-outputs parse-tree step-list step)
         no-outputs (empty? outputs)]
     (trace "should-build? forced:" forced)
     (trace "should-build? match-type:" match-type)
     (trace "should-build? triggered:" triggered)
+    (trace "should-build? temp-inputs: " temp-inputs)
+    (trace "should-build? real-inputs: " real-inputs)
     (trace "should-build? inputs: " inputs)
+    (trace "should-build? temp-outputs: " temp-outputs)
+    (trace "should-build? real-outputs: " real-outputs)
     (trace "should-build? outputs: " outputs)
     (trace "should-build? empty inputs: " empty-inputs)
     (trace "should-build? no-outputs: " no-outputs)
@@ -200,11 +247,13 @@
        ;; one or more outputs are missing? (can only look for those
        ;; for targets which were specified explicitly, not triggered)
        (and (not triggered)
-            (some #(not (fs di/data-in? %)) outputs)) "missing output"
+            (some no-data-in? outputs)) "missing output"
        ;; timecheck disabled
        (= false (get-in step [:opts :timecheck])) nil
        ;; built as a dependency?
        triggered "projected timestamped"
+       ;; if there are nothing but temp inputs, don't build
+       (and (empty? inputs) (not (empty? temp-inputs))) nil
        ;; no-input steps are always rebuilt
        (empty? inputs) "no-input step"
        :else
@@ -233,7 +282,8 @@
    (reduce (fn [[new-target-steps triggered-deps]
                 {:keys [index build match-type] :as step}]
              (let [step-map ((parse-tree :steps) index)
-                   cause (should-build? step-map (= build :forced)
+                   step-list (map :index target-steps)
+                   cause (should-build? parse-tree step-list step-map (= build :forced)
                                         (triggered-deps index) match-type false)]
                (trace (format "predict-steps, index=%d, cause=%s" index cause))
                (if (nil? cause)
@@ -302,7 +352,7 @@
 (defn- run-step
   "Runs one step performing all necessary checks, returns
    true if the step was actually run; false if skipped."
-  [parse-tree step-number {:keys [index build match-type opts]}]
+  [parse-tree steps step-number {:keys [index build match-type opts]}]
   (let [step ((parse-tree :steps) index)
         inputs (step :inputs)]
     ;; TODO(artem)
@@ -318,7 +368,8 @@
     (let [step-descr (step-string (branch-adjust-step step false))
           step (update-in step [:opts] #(merge % opts))
           step (prepare-step-for-run step parse-tree)
-          should-build (should-build? step (= build :forced)
+          step-list (map :index steps)
+          should-build (should-build? parse-tree step-list step (= build :forced)
                                       false match-type true)]
       (info "")
       (info (format "--- %d. %s: %s"
@@ -463,14 +514,14 @@
   (add-empty-promises-to-steps steps :exception-promise))
 
 (defn- attempt-run-step 
-  [parse-tree step]
+  [parse-tree steps step]
   (let [prom (:promise step)]
     (try
       ; run the step (the actual job)
-      (run-step parse-tree (:index step) step)
+      (run-step parse-tree steps (:index step) step)
       (deliver prom 1) ; delivers a promise of 1/success  
       (catch Exception e 
-        (error (str "caught exception step " (:index step) ": ") (.getMessage e) (.printStackTrace e))
+        (error e (str "caught exception step " (:index step) ": " e))
         (deliver (:exception-promise step) e))
       (finally
         ; if promise not delivered, deliver a promise of 0/failure
@@ -492,7 +543,7 @@
                                                      @(promises-indexed i)) 
                                                    deps))]
           (if (= successful-parent-steps (count deps))
-            (attempt-run-step parse-tree step)
+            (attempt-run-step parse-tree steps step)
             (deliver prom 0)))
         (catch Exception e
           (deliver (:exception-promise step) e))
@@ -553,6 +604,40 @@
   [steps]
   (reduce + (map (fn [step] @(:promise step)) steps)))
 
+(defn- setup-temp-deleting-futures
+  "Set up a future for each temp file which waits for all
+  the steps which depend on this temp file to finish and
+  then deletes the temp file."
+  [parse-tree steps]
+  (let [steps-list (map :index steps)
+        steps-map (zipmap steps-list steps)
+        steps-set (into #{} steps-list)]
+    (doseq [[file deps] (parse-tree :temp-input-map-lookup)]
+      (let [trimmed-deps (filter steps-set deps)]
+        (when (not (empty? trimmed-deps))
+          (future 
+            (try
+              (trace "Running future to delete target:" file "dependencies:" trimmed-deps) 
+              (let [successful-deps-count (reduce + 
+                                                  (map (fn [i] 
+                                                         @((steps-map i) :promise)) 
+                                                       trimmed-deps))] 
+                (trace "Finished waiting for dependents of target:" 
+                       file 
+                       "dependencies:" 
+                       trimmed-deps 
+                       "successful-count:" 
+                       successful-deps-count)
+                (when (= successful-deps-count (count trimmed-deps))
+                  (info "Deleting temp target:" file)
+                  (fs di/rm file)))
+              (catch Exception e 
+                ; Likely to happen if there is some problem with deleting the file.
+                ; Catch the exception and inform the user of the problem,
+                ; but do not hald execution as deletion of the file is probably not 
+                ; critical to the workflow.
+                (error e "Exception deleting temp target:" file)))))))))
+
 (defn- run-steps-async 
   "Runs steps asynchronously.
    If concurrence = 1, this will run the steps in the same order as the
@@ -581,6 +666,9 @@
                              assoc-promise
                              (assoc-function parse-tree))]
 
+          (when (not (:keep-temp-targets *options*))
+            (setup-temp-deleting-futures parse-tree steps-future)) 
+
           (post event-bus (EventWorkflowBegin steps-data))  
 
           (trigger-futures jobs steps-future event-bus)
@@ -911,6 +999,8 @@
                      "Turn on even more verbose debugging output.")
                    (no-arg version
                      "Show version information.")
+                   (no-arg keep-temp-targets
+                     "Do not auto-delete temp targets")
                    (with-arg tmpdir
                        "Specifies the temporary directory for Drake files (by default, .drake/ in the same directory the main workflow file is located)."
                        :type :str
@@ -958,8 +1048,7 @@
          (error (str "drake: " (m :msg)))
          (shutdown (or (get m :exit) 1)))
        (catch Exception e
-         (.printStackTrace e)
-         (error (stack-trace-str e))
+         (error e (str "Exception caught: " e))
          (shutdown 1))))))
 
 (defn run-opts [opts]