diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
index d37ae2ab99260..ba325e15bc38a 100644
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java
@@ -197,10 +197,30 @@ private static synchronized Configuration addRequiredProjectionFields(Configurat
     return configuration;
   }
 
+  /**
+   * Hive appends the read columns' ids to the previously configured ids during getRecordReader. In some cases,
+   * e.g. SELECT COUNT(*), the read column ids string is empty, so combining it with Hoodie's required projection
+   * ids produces a malformed value such as ",2,0,3" and causes an error. This is a temporary workaround: the
+   * underlying bug is in Hive (HIVE-22438), fixed in 3.0.0 and later, but earlier versions still hit it.
+   */
+  private static Configuration cleanProjectionColumnIds(Configuration conf) {
+    synchronized (conf) {
+      String columnIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
+      if (!columnIds.isEmpty() && columnIds.charAt(0) == ',') {
+        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, columnIds.substring(1));
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("The projection Ids: {" + columnIds + "} start with ','. First comma is removed");
+        }
+      }
+    }
+    return conf;
+  }
+
   @Override
   public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
       final Reporter reporter) throws IOException {
 
+    this.conf = cleanProjectionColumnIds(job);
     LOG.info("Before adding Hoodie columns, Projections :" + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
         + ", Ids :" + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
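
For reviewers who want to reproduce the symptom outside of Hive, below is a minimal, standalone sketch (not part of the patch) of the condition the new `cleanProjectionColumnIds` method guards against. It assumes only hadoop-common on the classpath and hard-codes `"hive.io.file.readcolumn.ids"`, the value of `ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR`, to avoid a hive-serde dependency; the class name and the hard-coded ids are illustrative only.

```java
import org.apache.hadoop.conf.Configuration;

/**
 * Illustrative sketch of the malformed projection ids left behind by HIVE-22438
 * and the normalization the patch applies. Not part of the patch.
 */
public class ProjectionIdCleanupSketch {

  // Same key as ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR in hive-serde.
  private static final String READ_COLUMN_IDS = "hive.io.file.readcolumn.ids";

  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // For SELECT COUNT(*), Hive starts from an empty id list; appending Hudi's
    // required projection ids can then leave a leading comma, e.g. ",2,0,3".
    conf.set(READ_COLUMN_IDS, ",2,0,3");

    // The same cleanup the new method performs: drop a single leading comma.
    String columnIds = conf.get(READ_COLUMN_IDS, "");
    if (!columnIds.isEmpty() && columnIds.charAt(0) == ',') {
      conf.set(READ_COLUMN_IDS, columnIds.substring(1));
    }

    System.out.println(conf.get(READ_COLUMN_IDS)); // prints "2,0,3"
  }
}
```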