Commit fbea5f0

Refactor MessageSet and Message into LegacyRecordBatch to later support v2 message format
1 parent: f04435c

21 files changed: +1142 / -355 lines

kafka/consumer/fetcher.py

+27 / -72
@@ -13,10 +13,10 @@
 from kafka.future import Future
 from kafka.metrics.stats import Avg, Count, Max, Rate
 from kafka.protocol.fetch import FetchRequest
-from kafka.protocol.message import PartialMessage
 from kafka.protocol.offset import (
     OffsetRequest, OffsetResetStrategy, UNKNOWN_OFFSET
 )
+from kafka.record import MemoryRecords
 from kafka.serializer import Deserializer
 from kafka.structs import TopicPartition, OffsetAndTimestamp

@@ -295,7 +295,7 @@ def fetched_records(self, max_records=None):

         Raises:
             OffsetOutOfRangeError: if no subscription offset_reset_strategy
-            InvalidMessageError: if message crc validation fails (check_crcs
+            CorruptRecordException: if message crc validation fails (check_crcs
                 must be set to True)
             RecordTooLargeError: if a message is larger than the currently
                 configured max_partition_fetch_bytes
@@ -440,57 +440,25 @@ def _message_generator(self):

         self._next_partition_records = None

-    def _unpack_message_set(self, tp, messages):
+    def _unpack_message_set(self, tp, records):
         try:
-            for offset, size, msg in messages:
-                if self.config['check_crcs'] and not msg.validate_crc():
-                    raise Errors.InvalidMessageError(msg)
-
-                if not msg.is_compressed():
-                    yield self._parse_record(tp, offset, msg.timestamp, msg)
-
-                else:
-                    # If relative offset is used, we need to decompress the entire message first
-                    # to compute the absolute offset.
-                    inner_mset = msg.decompress()
-
-                    # There should only ever be a single layer of compression
-                    if inner_mset[0][-1].is_compressed():
-                        log.warning('MessageSet at %s offset %d appears '
-                                    ' double-compressed. This should not'
-                                    ' happen -- check your producers!',
-                                    tp, offset)
-                        if self.config['skip_double_compressed_messages']:
-                            log.warning('Skipping double-compressed message at'
-                                        ' %s %d', tp, offset)
-                            continue
-
-                    if msg.magic > 0:
-                        last_offset, _, _ = inner_mset[-1]
-                        absolute_base_offset = offset - last_offset
-                    else:
-                        absolute_base_offset = -1
-
-                    for inner_offset, inner_size, inner_msg in inner_mset:
-                        if msg.magic > 0:
-                            # When magic value is greater than 0, the timestamp
-                            # of a compressed message depends on the
-                            # typestamp type of the wrapper message:
-
-                            if msg.timestamp_type == 0: # CREATE_TIME (0)
-                                inner_timestamp = inner_msg.timestamp
-
-                            elif msg.timestamp_type == 1: # LOG_APPEND_TIME (1)
-                                inner_timestamp = msg.timestamp
-
-                            else:
-                                raise ValueError('Unknown timestamp type: {0}'.format(msg.timestamp_type))
-                        else:
-                            inner_timestamp = msg.timestamp
-
-                        if absolute_base_offset >= 0:
-                            inner_offset += absolute_base_offset
-                        yield self._parse_record(tp, inner_offset, inner_timestamp, inner_msg)
+            batch = records.next_batch()
+            while batch is not None:
+                for record in batch:
+                    key_size = len(record.key) if record.key is not None else -1
+                    value_size = len(record.value) if record.value is not None else -1
+                    key = self._deserialize(
+                        self.config['key_deserializer'],
+                        tp.topic, record.key)
+                    value = self._deserialize(
+                        self.config['value_deserializer'],
+                        tp.topic, record.value)
+                    yield ConsumerRecord(
+                        tp.topic, tp.partition, record.offset, record.timestamp,
+                        record.timestamp_type, key, value, record.checksum,
+                        key_size, value_size)
+
+                batch = records.next_batch()

         # If unpacking raises StopIteration, it is erroneously
         # caught by the generator. We want all exceptions to be raised
@@ -505,15 +473,6 @@ def _unpack_message_set(self, tp, messages):
            log.exception('AssertionError raised unpacking messageset: %s', e)
            raise

-    def _parse_record(self, tp, offset, timestamp, msg):
-        key = self._deserialize(self.config['key_deserializer'], tp.topic, msg.key)
-        value = self._deserialize(self.config['value_deserializer'], tp.topic, msg.value)
-        return ConsumerRecord(tp.topic, tp.partition, offset,
-                              timestamp, msg.timestamp_type,
-                              key, value, msg.crc,
-                              len(msg.key) if msg.key is not None else -1,
-                              len(msg.value) if msg.value is not None else -1)
-
    def __iter__(self): # pylint: disable=non-iterator-returned
        return self

@@ -783,7 +742,7 @@ def _parse_fetched_data(self, completed_fetch):

         error_code, highwater = completed_fetch.partition_data[:2]
         error_type = Errors.for_code(error_code)
-        messages = completed_fetch.partition_data[-1]
+        records = MemoryRecords(completed_fetch.partition_data[-1])

         try:
             if not self._subscriptions.is_fetchable(tp):
@@ -807,21 +766,17 @@ def _parse_fetched_data(self, completed_fetch):
                           position)
                 return None

-            partial = None
-            if messages and isinstance(messages[-1][-1], PartialMessage):
-                partial = messages.pop()
-
-            if messages:
+            if records.has_next():
                 log.debug("Adding fetched record for partition %s with"
                           " offset %d to buffered record list", tp,
                           position)
-                unpacked = list(self._unpack_message_set(tp, messages))
+                unpacked = list(self._unpack_message_set(tp, records))
                 parsed_records = self.PartitionRecords(fetch_offset, tp, unpacked)
-                last_offset, _, _ = messages[-1]
+                last_offset = unpacked[-1].offset
                 self._sensors.records_fetch_lag.record(highwater - last_offset)
-                num_bytes = sum(msg[1] for msg in messages)
-                records_count = len(messages)
-            elif partial:
+                num_bytes = records.valid_bytes()
+                records_count = len(unpacked)
+            elif records.size_in_bytes() > 0:
                 # we did not read a single message from a non-empty
                 # buffer because that message's size is larger than
                 # fetch size, in this case record this exception

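Note (not part of the commit): the new read path above drives everything off kafka.record.MemoryRecords instead of decoded MessageSets. A minimal sketch of the same batch-iteration pattern, using only the calls visible in this diff (next_batch() and the record's offset/key/value attributes), with raw_bytes standing in for a hypothetical raw partition payload from a FetchResponse:

    from kafka.record import MemoryRecords

    def iter_records(raw_bytes):
        # raw_bytes: hypothetical buffer of fetched message-set bytes
        records = MemoryRecords(raw_bytes)
        batch = records.next_batch()
        while batch is not None:
            for record in batch:
                # mirrors what _unpack_message_set consumes above
                yield record.offset, record.key, record.value
            batch = records.next_batch()
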
kafka/errors.py

+5 / -2
@@ -101,12 +101,15 @@ class OffsetOutOfRangeError(BrokerResponseError):
                    ' maintained by the server for the given topic/partition.')


-class InvalidMessageError(BrokerResponseError):
+class CorruptRecordException(BrokerResponseError):
     errno = 2
-    message = 'INVALID_MESSAGE'
+    message = 'CORRUPT_MESSAGE'
     description = ('This message has failed its CRC checksum, exceeds the'
                    ' valid size, or is otherwise corrupt.')

+# Backward compatibility
+InvalidMessageError = CorruptRecordException
+

 class UnknownTopicOrPartitionError(BrokerResponseError):
     errno = 3

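Note (illustrative, not part of the commit): because InvalidMessageError is rebound to CorruptRecordException rather than removed, existing except clauses keep catching the same class:

    import kafka.errors as Errors

    assert Errors.InvalidMessageError is Errors.CorruptRecordException

    try:
        raise Errors.CorruptRecordException('crc check failed')
    except Errors.InvalidMessageError:
        pass  # the backward-compatibility alias catches the renamed exception
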
kafka/producer/buffer.py

+1 / -125
@@ -5,133 +5,9 @@
 import threading
 import time

-from ..codec import (has_gzip, has_snappy, has_lz4,
-                     gzip_encode, snappy_encode,
-                     lz4_encode, lz4_encode_old_kafka)
-from .. import errors as Errors
 from ..metrics.stats import Rate
-from ..protocol.types import Int32, Int64
-from ..protocol.message import MessageSet, Message

-
-
-class MessageSetBuffer(object):
-    """Wrap a buffer for writing MessageSet batches.
-
-    Arguments:
-        buf (IO stream): a buffer for writing data. Typically BytesIO.
-        batch_size (int): maximum number of bytes to write to the buffer.
-
-    Keyword Arguments:
-        compression_type ('gzip', 'snappy', None): compress messages before
-            publishing. Default: None.
-    """
-    _COMPRESSORS = {
-        'gzip': (has_gzip, gzip_encode, Message.CODEC_GZIP),
-        'snappy': (has_snappy, snappy_encode, Message.CODEC_SNAPPY),
-        'lz4': (has_lz4, lz4_encode, Message.CODEC_LZ4),
-        'lz4-old-kafka': (has_lz4, lz4_encode_old_kafka, Message.CODEC_LZ4),
-    }
-    def __init__(self, buf, batch_size, compression_type=None, message_version=0):
-        if compression_type is not None:
-            assert compression_type in self._COMPRESSORS, 'Unrecognized compression type'
-
-            # Kafka 0.8/0.9 had a quirky lz4...
-            if compression_type == 'lz4' and message_version == 0:
-                compression_type = 'lz4-old-kafka'
-
-            checker, encoder, attributes = self._COMPRESSORS[compression_type]
-            assert checker(), 'Compression Libraries Not Found'
-            self._compressor = encoder
-            self._compression_attributes = attributes
-        else:
-            self._compressor = None
-            self._compression_attributes = None
-
-        self._message_version = message_version
-        self._buffer = buf
-        # Init MessageSetSize to 0 -- update on close
-        self._buffer.seek(0)
-        self._buffer.write(Int32.encode(0))
-        self._batch_size = batch_size
-        self._closed = False
-        self._messages = 0
-        self._bytes_written = 4 # Int32 header is 4 bytes
-        self._final_size = None
-
-    def append(self, offset, message):
-        """Append a Message to the MessageSet.
-
-        Arguments:
-            offset (int): offset of the message
-            message (Message or bytes): message struct or encoded bytes
-
-        Returns: bytes written
-        """
-        if isinstance(message, Message):
-            encoded = message.encode()
-        else:
-            encoded = bytes(message)
-        msg = Int64.encode(offset) + Int32.encode(len(encoded)) + encoded
-        self._buffer.write(msg)
-        self._messages += 1
-        self._bytes_written += len(msg)
-        return len(msg)
-
-    def has_room_for(self, key, value):
-        if self._closed:
-            return False
-        if not self._messages:
-            return True
-        needed_bytes = MessageSet.HEADER_SIZE + Message.HEADER_SIZE
-        if key is not None:
-            needed_bytes += len(key)
-        if value is not None:
-            needed_bytes += len(value)
-        return self._buffer.tell() + needed_bytes < self._batch_size
-
-    def is_full(self):
-        if self._closed:
-            return True
-        return self._buffer.tell() >= self._batch_size
-
-    def close(self):
-        # This method may be called multiple times on the same batch
-        # i.e., on retries
-        # we need to make sure we only close it out once
-        # otherwise compressed messages may be double-compressed
-        # see Issue 718
-        if not self._closed:
-            if self._compressor:
-                # TODO: avoid copies with bytearray / memoryview
-                uncompressed_size = self._buffer.tell()
-                self._buffer.seek(4)
-                msg = Message(self._compressor(self._buffer.read(uncompressed_size - 4)),
-                              attributes=self._compression_attributes,
-                              magic=self._message_version)
-                encoded = msg.encode()
-                self._buffer.seek(4)
-                self._buffer.write(Int64.encode(0)) # offset 0 for wrapper msg
-                self._buffer.write(Int32.encode(len(encoded)))
-                self._buffer.write(encoded)
-
-            # Update the message set size (less the 4 byte header),
-            # and return with buffer ready for full read()
-            self._final_size = self._buffer.tell()
-            self._buffer.seek(0)
-            self._buffer.write(Int32.encode(self._final_size - 4))
-
-            self._buffer.seek(0)
-            self._closed = True
-
-    def size_in_bytes(self):
-        return self._final_size or self._buffer.tell()
-
-    def compression_rate(self):
-        return self.size_in_bytes() / self._bytes_written
-
-    def buffer(self):
-        return self._buffer
+import kafka.errors as Errors


 class SimpleBufferPool(object):

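Note (sketch, not part of the commit): the removed MessageSetBuffer.append() framed each entry as an Int64 offset plus an Int32 size prefix in front of the encoded message; that legacy framing is now produced inside kafka.record instead of the producer buffer. Roughly the same layout with the standard library, where encoded is assumed to be an already-encoded legacy message:

    import struct

    def frame_legacy_entry(offset, encoded):
        # [offset: int64][size: int32][message bytes], big-endian as on the wire
        return struct.pack('>qi', offset, len(encoded)) + encoded
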
kafka/producer/kafka.py

+34 / -9
@@ -12,9 +12,10 @@

 from .. import errors as Errors
 from ..client_async import KafkaClient, selectors
+from ..codec import has_gzip, has_snappy, has_lz4
 from ..metrics import MetricConfig, Metrics
 from ..partitioner.default import DefaultPartitioner
-from ..protocol.message import Message, MessageSet
+from ..record.legacy_records import LegacyRecordBatchBuilder
 from ..serializer import Serializer
 from ..structs import TopicPartition
 from .future import FutureRecordMetadata, FutureProduceResult
@@ -310,6 +311,13 @@ class KafkaProducer(object):
         'sasl_plain_password': None,
     }

+    _COMPRESSORS = {
+        'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
+        'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
+        'lz4': (has_lz4, LegacyRecordBatchBuilder.CODEC_LZ4),
+        None: (lambda: True, LegacyRecordBatchBuilder.CODEC_NONE),
+    }
+
     def __init__(self, **configs):
         log.debug("Starting the Kafka producer") # trace
         self.config = copy.copy(self.DEFAULT_CONFIG)
@@ -355,7 +363,16 @@ def __init__(self, **configs):
         if self.config['compression_type'] == 'lz4':
             assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers'

-        message_version = 1 if self.config['api_version'] >= (0, 10) else 0
+        # Check compression_type for library support
+        ct = self.config['compression_type']
+        if ct not in self._COMPRESSORS:
+            raise ValueError("Not supported codec: {}".format(ct))
+        else:
+            checker, compression_attrs = self._COMPRESSORS[ct]
+            assert checker(), "Libraries for {} compression codec not found".format(ct)
+            self.config['compression_type'] = compression_attrs
+
+        message_version = self._max_usable_produce_magic()
         self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
         self._metadata = client.cluster
         guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1)
@@ -465,6 +482,17 @@ def partitions_for(self, topic):
         max_wait = self.config['max_block_ms'] / 1000.0
         return self._wait_on_metadata(topic, max_wait)

+    def _max_usable_produce_magic(self):
+        if self.config['api_version'] >= (0, 10):
+            return 1
+        else:
+            return 0
+
+    def _estimate_size_in_bytes(self, key, value):
+        magic = self._max_usable_produce_magic()
+        return LegacyRecordBatchBuilder.estimate_size_in_bytes(
+            magic, self.config['compression_type'], key, value)
+
     def send(self, topic, value=None, key=None, partition=None, timestamp_ms=None):
         """Publish a message to a topic.

@@ -514,11 +542,7 @@ def send(self, topic, value=None, key=None, partition=None, timestamp_ms=None):
         partition = self._partition(topic, partition, key, value,
                                     key_bytes, value_bytes)

-        message_size = MessageSet.HEADER_SIZE + Message.HEADER_SIZE
-        if key_bytes is not None:
-            message_size += len(key_bytes)
-        if value_bytes is not None:
-            message_size += len(value_bytes)
+        message_size = self._estimate_size_in_bytes(key, value)
         self._ensure_valid_record_size(message_size)

         tp = TopicPartition(topic, partition)
@@ -527,11 +551,12 @@ def send(self, topic, value=None, key=None, partition=None, timestamp_ms=None):
         log.debug("Sending (key=%r value=%r) to %s", key, value, tp)
         result = self._accumulator.append(tp, timestamp_ms,
                                           key_bytes, value_bytes,
-                                          self.config['max_block_ms'])
+                                          self.config['max_block_ms'],
+                                          estimated_size=message_size)
         future, batch_is_full, new_batch_created = result
         if batch_is_full or new_batch_created:
             log.debug("Waking up the sender since %s is either full or"
-                       " getting a new batch", tp)
+                      " getting a new batch", tp)
             self._sender.wakeup()

         return future

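Note (sketch under assumptions, not part of the commit): the producer now maps the configured codec to a LegacyRecordBatchBuilder attribute and estimates record sizes through the builder, as in the hunks above. Roughly the same logic outside the class, using only the names that appear in this diff:

    from kafka.codec import has_gzip, has_snappy, has_lz4
    from kafka.record.legacy_records import LegacyRecordBatchBuilder

    _COMPRESSORS = {
        'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
        'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
        'lz4': (has_lz4, LegacyRecordBatchBuilder.CODEC_LZ4),
        None: (lambda: True, LegacyRecordBatchBuilder.CODEC_NONE),
    }

    def resolve_codec(compression_type):
        # Mirrors the check in KafkaProducer.__init__: unknown codecs raise,
        # known codecs must have their compression library installed.
        if compression_type not in _COMPRESSORS:
            raise ValueError("Not supported codec: {}".format(compression_type))
        checker, attrs = _COMPRESSORS[compression_type]
        assert checker(), "Libraries for {} compression codec not found".format(compression_type)
        return attrs

    # magic 0 for brokers < 0.10, magic 1 otherwise (see _max_usable_produce_magic)
    size = LegacyRecordBatchBuilder.estimate_size_in_bytes(
        1, resolve_codec(None), b'key', b'value')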