From ae77413e7a1b140245e7a8803cd81c4323273e66 Mon Sep 17 00:00:00 2001 From: Matthieu Martin Date: Tue, 24 Jun 2014 15:13:53 -0700 Subject: [PATCH] Merging/cherry-picking: 1) Cascading write support (see: https://github.com/apache/incubator-parquet-mr/commit/76bbf4a88645abc657ba6e4c2dc636712f03b944) 2) Caching fix (see: https://github.com/apache/incubator-parquet-mr/pull/2 and https://github.com/matt-martin/incubator-parquet-mr/tree/99bb5a3acdf6c998f5d344bbeaeb446031f47df0) --- .../parquet/cascading/ParquetTupleScheme.java | 44 +++-- .../parquet/cascading/TupleWriteSupport.java | 103 ++++++++++ parquet-hadoop/pom.xml | 6 + .../main/java/parquet/hadoop/LruCache.java | 181 ++++++++++++++++++ .../parquet/hadoop/ParquetInputFormat.java | 143 +++++++++++++- .../parquet/hadoop/ParquetOutputFormat.java | 5 + .../java/parquet/hadoop/TestInputFormat.java | 62 +++++- .../java/parquet/hadoop/TestLruCache.java | 144 ++++++++++++++ 8 files changed, 664 insertions(+), 24 deletions(-) create mode 100644 parquet-cascading/src/main/java/parquet/cascading/TupleWriteSupport.java create mode 100644 parquet-hadoop/src/main/java/parquet/hadoop/LruCache.java create mode 100644 parquet-hadoop/src/test/java/parquet/hadoop/TestLruCache.java diff --git a/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java b/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java index d75b6e76d..1f766f431 100644 --- a/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java +++ b/parquet-cascading/src/main/java/parquet/cascading/ParquetTupleScheme.java @@ -20,12 +20,10 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.RecordReader; - import org.apache.hadoop.fs.Path; - import parquet.hadoop.ParquetInputFormat; import parquet.hadoop.ParquetFileReader; + import parquet.hadoop.ParquetInputFormat; import parquet.hadoop.Footer; - import parquet.hadoop.metadata.ParquetMetadata; import parquet.hadoop.mapred.Container; import parquet.hadoop.mapred.DeprecatedParquetInputFormat; import parquet.schema.MessageType; @@ -40,6 +38,9 @@ import cascading.tap.hadoop.Hfs; import cascading.tuple.Tuple; import cascading.tuple.Fields; + import cascading.tuple.TupleEntry; + import parquet.hadoop.ParquetOutputFormat; + import parquet.hadoop.mapred.DeprecatedParquetOutputFormat; /** * A Cascading Scheme that converts Parquet groups into Cascading tuples. @@ -56,6 +57,7 @@ public class ParquetTupleScheme extends Scheme{ private static final long serialVersionUID = 0L; + private String parquetSchema; public ParquetTupleScheme() { super(); @@ -65,6 +67,20 @@ public ParquetTupleScheme(Fields sourceFields) { super(sourceFields); } + /** + * ParquetTupleScheme constructor used a sink need to be implemented + * + * @param sourceFields used for the reading step + * @param sinkFields used for the writing step + * @param schema is mandatory if you add sinkFields and needs to be the + * toString() from a MessageType. This value is going to be parsed when the + * parquet file will be created. 
+ */ + public ParquetTupleScheme(Fields sourceFields, Fields sinkFields, final String schema) { + super(sourceFields, sinkFields); + parquetSchema = schema; + } + @SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess fp, @@ -122,19 +138,23 @@ public boolean source(FlowProcess fp, SourceCall arg0, - Tap arg1, JobConf arg2) { - throw new UnsupportedOperationException("ParquetTupleScheme does not support Sinks"); - + public void sinkConfInit(FlowProcess fp, + Tap tap, JobConf jobConf) { + jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class); + jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); + ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); } @Override - public boolean isSink() { return false; } - + public boolean isSink() { + return parquetSchema != null; + } @Override - public void sink(FlowProcess arg0, SinkCall arg1) - throws IOException { - throw new UnsupportedOperationException("ParquetTupleScheme does not support Sinks"); + public void sink(FlowProcess fp, SinkCall sink) + throws IOException { + TupleEntry tuple = sink.getOutgoingEntry(); + OutputCollector outputCollector = sink.getOutput(); + outputCollector.collect(null, tuple); } } \ No newline at end of file diff --git a/parquet-cascading/src/main/java/parquet/cascading/TupleWriteSupport.java b/parquet-cascading/src/main/java/parquet/cascading/TupleWriteSupport.java new file mode 100644 index 000000000..6cc13427f --- /dev/null +++ b/parquet-cascading/src/main/java/parquet/cascading/TupleWriteSupport.java @@ -0,0 +1,103 @@ +/** + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */
+package parquet.cascading;
+
+import cascading.tuple.TupleEntry;
+import java.util.HashMap;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import parquet.hadoop.api.WriteSupport;
+import parquet.io.api.Binary;
+import parquet.io.api.RecordConsumer;
+import parquet.schema.MessageType;
+import parquet.schema.MessageTypeParser;
+import parquet.schema.PrimitiveType;
+import parquet.schema.Type;
+
+/**
+ *
+ *
+ * @author Mickaël Lacour
+ */
+public class TupleWriteSupport extends WriteSupport<TupleEntry> {
+
+  private RecordConsumer recordConsumer;
+  private MessageType rootSchema;
+  public static final String PARQUET_CASCADING_SCHEMA = "parquet.cascading.schema";
+
+  @Override
+  public WriteContext init(Configuration configuration) {
+    String schema = configuration.get(PARQUET_CASCADING_SCHEMA);
+    rootSchema = MessageTypeParser.parseMessageType(schema);
+    return new WriteContext(rootSchema, new HashMap<String, String>());
+  }
+
+  @Override
+  public void prepareForWrite(RecordConsumer recordConsumer) {
+    this.recordConsumer = recordConsumer;
+  }
+
+  @Override
+  public void write(TupleEntry record) {
+    recordConsumer.startMessage();
+    final List<Type> fields = rootSchema.getFields();
+    int i = 0;
+    for (Type field : fields) {
+      recordConsumer.startField(field.getName(), i);
+      if (field.isPrimitive()) {
+        writePrimitive(record, field.asPrimitiveType());
+      } else {
+        throw new UnsupportedOperationException("Complex type not implemented");
+      }
+      recordConsumer.endField(field.getName(), i);
+      ++i;
+    }
+    recordConsumer.endMessage();
+  }
+
+  private void writePrimitive(TupleEntry record, PrimitiveType field) {
+    if (record == null) {
+      return;
+    }
+
+    switch (field.getPrimitiveTypeName()) {
+      case BINARY:
+        recordConsumer.addBinary(Binary.fromString(record.getString(field.getName())));
+        break;
+      case BOOLEAN:
+        recordConsumer.addBoolean(record.getBoolean(field.getName()));
+        break;
+      case INT32:
+        recordConsumer.addInteger(record.getInteger(field.getName()));
+        break;
+      case INT64:
+        recordConsumer.addLong(record.getLong(field.getName()));
+        break;
+      case DOUBLE:
+        recordConsumer.addDouble(record.getDouble(field.getName()));
+        break;
+      case FLOAT:
+        recordConsumer.addFloat(record.getFloat(field.getName()));
+        break;
+      case FIXED_LEN_BYTE_ARRAY:
+        throw new UnsupportedOperationException("Fixed len byte array type not implemented");
+      case INT96:
+        throw new UnsupportedOperationException("Int96 type not implemented");
+      default:
+        throw new UnsupportedOperationException(field.getName() + " type not implemented");
+    }
+  }
+}
diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml
index b122e459e..ecee1e0d7 100644
--- a/parquet-hadoop/pom.xml
+++ b/parquet-hadoop/pom.xml
@@ -61,6 +61,12 @@
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-all</artifactId>
+      <version>1.9.5</version>
+      <scope>test</scope>
+    </dependency>
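
Illustration (not part of the patch; the class name, field names, and output path are hypothetical): the new sink support is driven entirely by the schema string passed to the three-argument ParquetTupleScheme constructor, which sinkConfInit() later copies into the job under TupleWriteSupport.PARQUET_CASCADING_SCHEMA. A minimal sketch of wiring up a Parquet sink tap:

    import cascading.tap.Tap;
    import cascading.tap.hadoop.Hfs;
    import cascading.tuple.Fields;
    import parquet.cascading.ParquetTupleScheme;

    public class ParquetSinkExample {
      public static Tap createSink() {
        Fields fields = new Fields("name", "age");
        // Must be the toString() form of a Parquet MessageType; TupleWriteSupport
        // parses it back with MessageTypeParser. Field names are illustrative and
        // must line up with the sink Fields.
        String schema = "message example {\n"
            + "  required binary name;\n"
            + "  required int32 age;\n"
            + "}";
        // Supplying sink fields plus a schema makes isSink() return true.
        return new Hfs(new ParquetTupleScheme(fields, fields, schema), "/tmp/parquet-out");
      }
    }
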
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/LruCache.java b/parquet-hadoop/src/main/java/parquet/hadoop/LruCache.java
new file mode 100644
index 000000000..e9ecb37d2
--- /dev/null
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/LruCache.java
@@ -0,0 +1,181 @@
+package parquet.hadoop;
+
+import parquet.Log;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * A basic implementation of an LRU cache. Besides evicting the least recently
+ * used entries (either based on insertion or access order), this class also
+ * checks for "stale" entries as entries are inserted or retrieved (note
+ * "staleness" is defined by the entries themselves; see
+ * {@link parquet.hadoop.LruCache.Value}).
+ *
+ * @param <K> The key type. Acts as the key in a {@link java.util.LinkedHashMap}
+ * @param <V> The value type. Must extend {@link parquet.hadoop.LruCache.Value}
+ *            so that the "staleness" of the value can be easily determined.
+ */
+final class LruCache<K, V extends LruCache.Value<K, V>> {
+  private static final Log LOG = Log.getLog(LruCache.class);
+
+  private static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+  private final LinkedHashMap<K, V> cacheMap;
+
+  /**
+   * Constructs an access-order based LRU cache with {@code maxSize} entries.
+   * @param maxSize The maximum number of entries to store in the cache.
+   */
+  public LruCache(final int maxSize) {
+    this(maxSize, DEFAULT_LOAD_FACTOR, true);
+  }
+
+  /**
+   * Constructs an LRU cache.
+   *
+   * @param maxSize The maximum number of entries to store in the cache.
+   * @param loadFactor Used to determine the initial capacity.
+   * @param accessOrder the ordering mode - {@code true} for access-order,
+   * {@code false} for insertion-order
+   */
+  public LruCache(final int maxSize, final float loadFactor, final boolean accessOrder) {
+    int initialCapacity = Math.round(maxSize / loadFactor);
+    cacheMap =
+            new LinkedHashMap<K, V>(initialCapacity, loadFactor, accessOrder) {
+              @Override
+              public boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
+                boolean result = size() > maxSize;
+                if (result) {
+                  if (Log.DEBUG) {
+                    LOG.debug("Removing eldest entry in cache: "
+                            + eldest.getKey());
+                  }
+                }
+                return result;
+              }
+            };
+  }
+
+  /**
+   * Removes the mapping for the specified key from this cache if present.
+   * @param key key whose mapping is to be removed from the cache
+   * @return the previous value associated with key, or null if there was no
+   *         mapping for key.
+   */
+  public V remove(final K key) {
+    V oldValue = cacheMap.remove(key);
+    if (oldValue != null) {
+      if (Log.DEBUG) {
+        LOG.debug("Removed cache entry for '" + key + "'");
+      }
+    }
+    return oldValue;
+  }
+
+  /**
+   * Associates the specified value with the specified key in this cache. The
+   * value is only inserted if it is not null and it is considered current. If
+   * the cache previously contained a mapping for the key, the old value is
+   * replaced only if the new value is "newer" than the old one.
+   * @param key key with which the specified value is to be associated
+   * @param newValue value to be associated with the specified key
+   */
+  public void put(final K key, final V newValue) {
+    if (newValue == null || !newValue.isCurrent(key)) {
+      if (Log.WARN) {
+        LOG.warn("Ignoring new cache entry for '" + key + "' because it is "
+                + (newValue == null ? "null" : "not current"));
+      }
+      return;
+    }
+
+    V oldValue = cacheMap.get(key);
+    if (oldValue != null && oldValue.isNewerThan(newValue)) {
+      if (Log.WARN) {
+        LOG.warn("Ignoring new cache entry for '" + key + "' because "
+                + "existing cache entry is newer");
+      }
+      return;
+    }
+
+    // no existing value or new value is newer than old value
+    oldValue = cacheMap.put(key, newValue);
+    if (Log.DEBUG) {
+      if (oldValue == null) {
+        LOG.debug("Added new cache entry for '" + key + "'");
+      } else {
+        LOG.debug("Overwrote existing cache entry for '" + key + "'");
+      }
+    }
+  }
+
+  /**
+   * Removes all of the mappings from this cache. The cache will be empty
+   * after this call returns.
+   */
+  public void clear() {
+    cacheMap.clear();
+  }
+
+  /**
+   * Returns the value to which the specified key is mapped, or null if 1) the
+   * value is not current or 2) this cache contains no mapping for the key.
+   * @param key the key whose associated value is to be returned
+   * @return the value to which the specified key is mapped, or null if 1) the
+   *         value is not current or 2) this cache contains no mapping for the key
+   */
+  public V getCurrentValue(final K key) {
+    V value = cacheMap.get(key);
+    if (Log.DEBUG) {
+      LOG.debug("Value for '" + key + "' " + (value == null ? "not " : "")
+              + "in cache");
+    }
+    if (value != null && !value.isCurrent(key)) {
+      // value is not current; remove it and return null
+      remove(key);
+      return null;
+    }
+
+    return value;
+  }
+
+  /**
+   * Returns the number of key-value mappings in this cache.
+   * @return the number of key-value mappings in this cache.
+   */
+  public int size() {
+    return cacheMap.size();
+  }
+
+  /**
+   * {@link parquet.hadoop.LruCache} expects all values to follow this
+   * interface so the cache can determine 1) whether values are current (e.g.
+   * the referenced data has not been modified/updated in such a way that the
+   * value is no longer useful) and 2) whether a value is strictly "newer"
+   * than another value.
+   *
+   * @param <K> The key type.
+   * @param <V> Provides a bound for the {@link #isNewerThan(V)} method
+   */
+  interface Value<K, V> {
+    /**
+     * Is the value still current (e.g. has the referenced data been
+     * modified/updated in such a way that the value is no longer useful)?
+     * @param key the key associated with this value
+     * @return {@code true} if the value is still current, {@code false} if
+     *         the value is no longer useful
+     */
+    boolean isCurrent(K key);
+
+    /**
+     * Compares this value with the specified value to check for relative age.
+     * @param otherValue the value to be compared.
+     * @return {@code true} if the value is strictly newer than the other value,
+     *         {@code false} if the value is older or just
+     *         as new as the other value.
+     */
+    boolean isNewerThan(V otherValue);
+  }
+
+}
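
For context (illustration only, not part of the patch): the LruCache constructor above delegates the actual LRU bookkeeping to java.util.LinkedHashMap's access-order mode and its removeEldestEntry() hook, layering the staleness checks on top. The underlying JDK idiom, as a self-contained sketch:

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class AccessOrderDemo {
      public static void main(String[] args) {
        final int maxSize = 2;
        // 'true' selects access order: get() moves an entry to the youngest end.
        Map<String, String> cache =
            new LinkedHashMap<String, String>(4, 0.75f, true) {
              @Override
              protected boolean removeEldestEntry(Map.Entry<String, String> eldest) {
                return size() > maxSize;  // evict once the cap is exceeded
              }
            };
        cache.put("a", "1");
        cache.put("b", "2");
        cache.get("a");        // touch "a"; "b" is now the eldest entry
        cache.put("c", "3");   // adding "c" evicts "b"
        System.out.println(cache.keySet());  // prints [a, c]
      }
    }
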
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetInputFormat.java b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetInputFormat.java
index 16fa13eab..2eb27c249 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/ParquetInputFormat.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/ParquetInputFormat.java
@@ -18,9 +18,13 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
@@ -77,8 +81,11 @@ public class ParquetInputFormat<T> extends FileInputFormat<Void, T> {
    */
   public static final String UNBOUND_RECORD_FILTER = "parquet.read.filter";
 
+  private static final int MIN_FOOTER_CACHE_SIZE = 100;
+
+  private LruCache<FileStatusWrapper, FootersCacheValue> footersCache;
+
   private Class<?> readSupportClass;
-  private List
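
To make the staleness contract concrete (hypothetical sketch, not the value class the full patch adds for footersCache): anything stored in the new cache field above must implement the LruCache.Value interface, typically by remembering when the underlying file was last modified and declaring itself stale once that changes. Names and the java.io.File shortcut below are illustrative; a production version would track Hadoop file status information instead.

    package parquet.hadoop;  // LruCache is package-private, so the sketch lives alongside it

    import java.io.File;

    // Hypothetical Value implementation: current while the file's mtime is unchanged.
    class FileMtimeValue implements LruCache.Value<String, FileMtimeValue> {
      private final long modificationTime;  // mtime observed when the value was built
      private final Object footer;          // placeholder for the cached footer

      FileMtimeValue(String path, Object footer) {
        this.modificationTime = new File(path).lastModified();
        this.footer = footer;
      }

      @Override
      public boolean isCurrent(String path) {
        // stale as soon as the file has been modified since we cached it
        return new File(path).lastModified() == modificationTime;
      }

      @Override
      public boolean isNewerThan(FileMtimeValue other) {
        return other == null || modificationTime > other.modificationTime;
      }
    }

    // Usage: LruCache<String, FileMtimeValue> cache = new LruCache<String, FileMtimeValue>(100);
    // cache.getCurrentValue(path) returns null once the underlying file changes.
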