From 671b74ab2e035a2f2ba7f90419794f6dbec08366 Mon Sep 17 00:00:00 2001
From: Patrick Lucas
Date: Sat, 3 May 2014 11:27:57 -0700
Subject: [PATCH 1/2] Add 'codec' parameter to Producer

Adds a codec parameter to Producer.__init__ that lets the user choose
a compression codec to use for all messages sent by it.
---
 kafka/producer.py | 42 +++++++++++++++++++++++++++++++++---------
 kafka/protocol.py | 23 ++++++++++++-----------
 test/test_unit.py |  8 ++++----
 3 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/kafka/producer.py b/kafka/producer.py
index 12a293401..563d160a4 100644
--- a/kafka/producer.py
+++ b/kafka/producer.py
@@ -10,7 +10,10 @@
 from kafka.common import ProduceRequest, TopicAndPartition
 from kafka.partitioner import HashedPartitioner
-from kafka.protocol import create_message
+from kafka.protocol import (
+    CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, ALL_CODECS,
+    create_message, create_gzip_message, create_snappy_message,
+)
 
 log = logging.getLogger("kafka")
 
@@ -20,7 +23,7 @@
 STOP_ASYNC_PRODUCER = -1
 
 
-def _send_upstream(queue, client, batch_time, batch_size,
+def _send_upstream(queue, client, codec, batch_time, batch_size,
                    req_acks, ack_timeout):
     """
     Listen on the queue for a specified number of messages or till
@@ -61,7 +64,14 @@ def _send_upstream(queue, client, batch_time, batch_size,
 
         # Send collected requests upstream
         reqs = []
-        for topic_partition, messages in msgset.items():
+        for topic_partition, msg in msgset.items():
+            if codec == CODEC_GZIP:
+                messages = [create_gzip_message(msg)]
+            elif codec == CODEC_SNAPPY:
+                messages = [create_snappy_message(msg)]
+            else:
+                messages = [create_message(m) for m in msg]
+
             req = ProduceRequest(topic_partition.topic,
                                  topic_partition.partition,
                                  messages)
@@ -101,6 +111,7 @@ class Producer(object):
     def __init__(self, client, async=False,
                  req_acks=ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=DEFAULT_ACK_TIMEOUT,
+                 codec=None,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
@@ -118,11 +129,17 @@ def __init__(self, client, async=False,
         self.req_acks = req_acks
         self.ack_timeout = ack_timeout
 
+        if codec is None:
+            codec = CODEC_NONE
+        assert codec in ALL_CODECS
+        self.codec = codec
+
         if self.async:
             self.queue = Queue()  # Messages are sent through this queue
             self.proc = Process(target=_send_upstream,
                                 args=(self.queue,
                                       self.client.copy(),
+                                      self.codec,
                                       batch_send_every_t,
                                       batch_send_every_n,
                                       self.req_acks,
@@ -138,11 +155,16 @@ def send_messages(self, topic, partition, *msg):
         """
         if self.async:
             for m in msg:
-                self.queue.put((TopicAndPartition(topic, partition),
-                                create_message(m)))
+                self.queue.put((TopicAndPartition(topic, partition), m))
             resp = []
         else:
-            messages = [create_message(m) for m in msg]
+            if self.codec == CODEC_GZIP:
+                messages = [create_gzip_message(msg)]
+            elif self.codec == CODEC_SNAPPY:
+                messages = [create_snappy_message(msg)]
+            else:
+                messages = [create_message(m) for m in msg]
+
             req = ProduceRequest(topic, partition, messages)
             try:
                 resp = self.client.send_produce_request([req], acks=self.req_acks,
@@ -167,7 +189,7 @@ def stop(self, timeout=1):
 
 class SimpleProducer(Producer):
     """
-    A simple, round-robbin producer. Each message goes to exactly one partition
+    A simple, round-robin producer. Each message goes to exactly one partition
 
     Params:
     client - The Kafka client instance to use
@@ -184,12 +206,13 @@ class SimpleProducer(Producer):
    def __init__(self, client, async=False,
                 req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                 ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
+                codec=None,
                 batch_send=False,
                 batch_send_every_n=BATCH_SEND_MSG_COUNT,
                 batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
        self.partition_cycles = {}
        super(SimpleProducer, self).__init__(client, async, req_acks,
-                                             ack_timeout, batch_send,
+                                             ack_timeout, codec, batch_send,
                                              batch_send_every_n,
                                              batch_send_every_t)
@@ -227,6 +250,7 @@ class KeyedProducer(Producer):
    def __init__(self, client, partitioner=None, async=False,
                 req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                 ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
+                codec=None,
                 batch_send=False,
                 batch_send_every_n=BATCH_SEND_MSG_COUNT,
                 batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
@@ -236,7 +260,7 @@ def __init__(self, client, partitioner=None, async=False,
        self.partitioners = {}
        super(KeyedProducer, self).__init__(client, async, req_acks,
-                                            ack_timeout, batch_send,
+                                            ack_timeout, codec, batch_send,
                                             batch_send_every_n,
                                             batch_send_every_t)

diff --git a/kafka/protocol.py b/kafka/protocol.py
index 25be023eb..10557be83 100644
--- a/kafka/protocol.py
+++ b/kafka/protocol.py
@@ -18,6 +18,12 @@
 
 log = logging.getLogger("kafka")
 
+ATTRIBUTE_CODEC_MASK = 0x03
+CODEC_NONE = 0x00
+CODEC_GZIP = 0x01
+CODEC_SNAPPY = 0x02
+ALL_CODECS = (CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY)
+
 
 class KafkaProtocol(object):
     """
@@ -32,11 +38,6 @@ class KafkaProtocol(object):
     OFFSET_COMMIT_KEY = 8
     OFFSET_FETCH_KEY = 9
 
-    ATTRIBUTE_CODEC_MASK = 0x03
-    CODEC_NONE = 0x00
-    CODEC_GZIP = 0x01
-    CODEC_SNAPPY = 0x02
-
     ###################
     #   Private API   #
     ###################
@@ -151,17 +152,17 @@ def _decode_message(cls, data, offset):
         (key, cur) = read_int_string(data, cur)
         (value, cur) = read_int_string(data, cur)
 
-        codec = att & KafkaProtocol.ATTRIBUTE_CODEC_MASK
+        codec = att & ATTRIBUTE_CODEC_MASK
 
-        if codec == KafkaProtocol.CODEC_NONE:
+        if codec == CODEC_NONE:
             yield (offset, Message(magic, att, key, value))
 
-        elif codec == KafkaProtocol.CODEC_GZIP:
+        elif codec == CODEC_GZIP:
             gz = gzip_decode(value)
             for (offset, msg) in KafkaProtocol._decode_message_set_iter(gz):
                 yield (offset, msg)
 
-        elif codec == KafkaProtocol.CODEC_SNAPPY:
+        elif codec == CODEC_SNAPPY:
             snp = snappy_decode(value)
             for (offset, msg) in KafkaProtocol._decode_message_set_iter(snp):
                 yield (offset, msg)
@@ -544,7 +545,7 @@ def create_gzip_message(payloads, key=None):
         [create_message(payload) for payload in payloads])
 
     gzipped = gzip_encode(message_set)
-    codec = KafkaProtocol.ATTRIBUTE_CODEC_MASK & KafkaProtocol.CODEC_GZIP
+    codec = ATTRIBUTE_CODEC_MASK & CODEC_GZIP
 
     return Message(0, 0x00 | codec, key, gzipped)
 
@@ -565,6 +566,6 @@ def create_snappy_message(payloads, key=None):
         [create_message(payload) for payload in payloads])
 
     snapped = snappy_encode(message_set)
-    codec = KafkaProtocol.ATTRIBUTE_CODEC_MASK & KafkaProtocol.CODEC_SNAPPY
+    codec = ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY
 
     return Message(0, 0x00 | codec, key, snapped)

diff --git a/test/test_unit.py b/test/test_unit.py
index 8c0dd004f..334ff9704 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -135,8 +135,8 @@ def test_create_gzip(self):
         payloads = ["v1", "v2"]
         msg = create_gzip_message(payloads)
         self.assertEqual(msg.magic, 0)
-        self.assertEqual(msg.attributes, KafkaProtocol.ATTRIBUTE_CODEC_MASK &
-                                         KafkaProtocol.CODEC_GZIP)
+        self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK &
+                                         CODEC_GZIP)
         self.assertEqual(msg.key, None)
         # Need to decode to check since gzipped payload is non-deterministic
         decoded = gzip_decode(msg.value)
@@ -151,8 +151,8 @@ def test_create_snappy(self):
         payloads = ["v1", "v2"]
         msg = create_snappy_message(payloads)
         self.assertEqual(msg.magic, 0)
-        self.assertEqual(msg.attributes, KafkaProtocol.ATTRIBUTE_CODEC_MASK &
-                                         KafkaProtocol.CODEC_SNAPPY)
+        self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK &
+                                         CODEC_SNAPPY)
         self.assertEqual(msg.key, None)
         expect = ("8\x00\x00\x19\x01@\x10L\x9f[\xc2\x00\x00\xff\xff\xff\xff"
                   "\x00\x00\x00\x02v1\x19\x1bD\x00\x10\xd5\x96\nx\x00\x00\xff"
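As a usage sketch of what the first patch enables (illustrative only, not
part of the patch): the broker address and topic name below are
hypothetical, and the KafkaClient constructor arguments may vary between
kafka-python versions.

    from kafka import KafkaClient
    from kafka.producer import SimpleProducer
    from kafka.protocol import CODEC_GZIP

    # Hypothetical broker address; adjust for your deployment.
    client = KafkaClient("localhost:9092")

    # Every message sent through this producer is gzip-compressed.
    # Passing an unknown codec trips the assert in Producer.__init__;
    # the second patch below replaces that assert with a real exception.
    producer = SimpleProducer(client, codec=CODEC_GZIP)
    producer.send_messages("my-topic", "message 1", "message 2")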
From 805b52a34da9ce0dead80a64d7315412f2034673 Mon Sep 17 00:00:00 2001
From: Patrick Lucas
Date: Wed, 7 May 2014 10:02:57 -0700
Subject: [PATCH 2/2] Improve error handling and tests w.r.t. codecs

Add function kafka.protocol.create_message_set() that takes a list of
payloads and a codec and returns a message set with the desired encoding.

Introduce kafka.common.UnsupportedCodecError, raised if an unknown codec
is specified.

Include a test for the new function.
---
 kafka/common.py       |  6 ++++
 kafka/producer.py     | 29 ++++++--------------
 kafka/protocol.py     | 19 ++++++++++++-
 test/test_protocol.py | 64 +++++++++++++++++++++++++++++++++++++++----
 4 files changed, 91 insertions(+), 27 deletions(-)

diff --git a/kafka/common.py b/kafka/common.py
index d5155321e..209754b6a 100644
--- a/kafka/common.py
+++ b/kafka/common.py
@@ -170,6 +170,11 @@ class ConsumerNoMoreData(KafkaError):
 class ProtocolError(KafkaError):
     pass
 
+
+class UnsupportedCodecError(KafkaError):
+    pass
+
+
 kafka_errors = {
     -1 : UnknownError,
     1  : OffsetOutOfRangeError,
@@ -187,6 +192,7 @@ class ProtocolError(KafkaError):
     13 : StaleLeaderEpochCodeError,
 }
 
+
 def check_error(response):
     error = kafka_errors.get(response.error)
     if error:

diff --git a/kafka/producer.py b/kafka/producer.py
index 9ecb34155..8e40be5b1 100644
--- a/kafka/producer.py
+++ b/kafka/producer.py
@@ -9,12 +9,11 @@
 from itertools import cycle
 from multiprocessing import Queue, Process
 
-from kafka.common import ProduceRequest, TopicAndPartition
-from kafka.partitioner import HashedPartitioner
-from kafka.protocol import (
-    CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, ALL_CODECS,
-    create_message, create_gzip_message, create_snappy_message,
-)
+from kafka.common import (
+    ProduceRequest, TopicAndPartition, UnsupportedCodecError
+)
+from kafka.partitioner import HashedPartitioner
+from kafka.protocol import CODEC_NONE, ALL_CODECS, create_message_set
 
 log = logging.getLogger("kafka")
 
@@ -66,13 +65,7 @@ def _send_upstream(queue, client, codec, batch_time, batch_size,
         # Send collected requests upstream
         reqs = []
         for topic_partition, msg in msgset.items():
-            if codec == CODEC_GZIP:
-                messages = [create_gzip_message(msg)]
-            elif codec == CODEC_SNAPPY:
-                messages = [create_snappy_message(msg)]
-            else:
-                messages = [create_message(m) for m in msg]
-
+            messages = create_message_set(msg, codec)
             req = ProduceRequest(topic_partition.topic,
                                  topic_partition.partition,
                                  messages)
@@ -132,7 +125,9 @@ def __init__(self, client, async=False,
 
         if codec is None:
             codec = CODEC_NONE
-        assert codec in ALL_CODECS
+        elif codec not in ALL_CODECS:
+            raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec)
+
         self.codec = codec
 
         if self.async:
@@ -159,13 +154,7 @@ def send_messages(self, topic, partition, *msg):
                 self.queue.put((TopicAndPartition(topic, partition), m))
             resp = []
         else:
-            if self.codec == CODEC_GZIP:
-                messages = [create_gzip_message(msg)]
-            elif self.codec == CODEC_SNAPPY:
-                messages = [create_snappy_message(msg)]
-            else:
-                messages = [create_message(m) for m in msg]
-
+            messages = create_message_set(msg, self.codec)
             req = ProduceRequest(topic, partition, messages)
             try:
                 resp = self.client.send_produce_request([req], acks=self.req_acks,

diff --git a/kafka/protocol.py b/kafka/protocol.py
index 730ae6b0a..58661c7fd 100644
--- a/kafka/protocol.py
+++ b/kafka/protocol.py
@@ -9,7 +9,8 @@
     BrokerMetadata, PartitionMetadata, Message, OffsetAndMessage,
     ProduceResponse, FetchResponse, OffsetResponse,
     OffsetCommitResponse, OffsetFetchResponse, ProtocolError,
-    BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall
+    BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall,
+    UnsupportedCodecError
 )
 from kafka.util import (
     read_short_string, read_int_string, relative_unpack,
@@ -568,3 +569,19 @@ def create_snappy_message(payloads, key=None):
     codec = ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY
 
     return Message(0, 0x00 | codec, key, snapped)
+
+
+def create_message_set(messages, codec=CODEC_NONE):
+    """Create a message set using the given codec.
+
+    If codec is CODEC_NONE, return a list of raw Kafka messages. Otherwise,
+    return a list containing a single codec-encoded message.
+    """
+    if codec == CODEC_NONE:
+        return [create_message(m) for m in messages]
+    elif codec == CODEC_GZIP:
+        return [create_gzip_message(messages)]
+    elif codec == CODEC_SNAPPY:
+        return [create_snappy_message(messages)]
+    else:
+        raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec)

diff --git a/test/test_protocol.py b/test/test_protocol.py
index 854a4396a..2089f48d7 100644
--- a/test/test_protocol.py
+++ b/test/test_protocol.py
@@ -1,23 +1,30 @@
+import contextlib
+from contextlib import contextmanager
 import struct
 
 import unittest2
 
+import mock
+from mock import sentinel
+
 from kafka import KafkaClient
 from kafka.common import (
     OffsetRequest, OffsetCommitRequest, OffsetFetchRequest,
     OffsetResponse, OffsetCommitResponse, OffsetFetchResponse,
     ProduceRequest, FetchRequest, Message, ChecksumError,
-    ConsumerFetchSizeTooSmall, ProduceResponse, FetchResponse,
-    OffsetAndMessage, BrokerMetadata, PartitionMetadata,
-    TopicAndPartition, KafkaUnavailableError, ProtocolError,
-    LeaderUnavailableError, PartitionUnavailableError
+    ConsumerFetchSizeTooSmall, ProduceResponse, FetchResponse, OffsetAndMessage,
+    BrokerMetadata, PartitionMetadata, TopicAndPartition, KafkaUnavailableError,
+    ProtocolError, LeaderUnavailableError, PartitionUnavailableError,
+    UnsupportedCodecError
 )
 from kafka.codec import (
     has_snappy, gzip_encode, gzip_decode,
     snappy_encode, snappy_decode
 )
+import kafka.protocol
 from kafka.protocol import (
-    create_gzip_message, create_message, create_snappy_message, KafkaProtocol,
-    ATTRIBUTE_CODEC_MASK, CODEC_GZIP, CODEC_SNAPPY
+    ATTRIBUTE_CODEC_MASK, CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, KafkaProtocol,
+    create_message, create_gzip_message, create_snappy_message,
+    create_message_set
 )
 
 class TestProtocol(unittest2.TestCase):
@@ -691,3 +698,48 @@ def test_decode_offset_fetch_response(self):
             OffsetFetchResponse(topic = 'topic1', partition = 2, offset = 4, error = 0, metadata = "meta"),
             OffsetFetchResponse(topic = 'topic1', partition = 4, offset = 8, error = 0, metadata = "meta"),
         ]))
+
+    @contextmanager
+    def mock_create_message_fns(self):
+        patches = contextlib.nested(
+            mock.patch.object(kafka.protocol, "create_message",
+                              return_value=sentinel.message),
+            mock.patch.object(kafka.protocol, "create_gzip_message",
+                              return_value=sentinel.gzip_message),
+            mock.patch.object(kafka.protocol, "create_snappy_message",
+                              return_value=sentinel.snappy_message),
+        )
+
+        with patches:
+            yield
+
+    def test_create_message_set(self):
+        messages = [1, 2, 3]
+
+        # Default codec is CODEC_NONE. Expect list of regular messages.
+        expect = [sentinel.message] * len(messages)
+        with self.mock_create_message_fns():
+            message_set = create_message_set(messages)
+        self.assertEqual(message_set, expect)
+
+        # CODEC_NONE: Expect list of regular messages.
+        expect = [sentinel.message] * len(messages)
+        with self.mock_create_message_fns():
+            message_set = create_message_set(messages, CODEC_NONE)
+        self.assertEqual(message_set, expect)
+
+        # CODEC_GZIP: Expect list of one gzip-encoded message.
+        expect = [sentinel.gzip_message]
+        with self.mock_create_message_fns():
+            message_set = create_message_set(messages, CODEC_GZIP)
+        self.assertEqual(message_set, expect)
+
+        # CODEC_SNAPPY: Expect list of one snappy-encoded message.
+        expect = [sentinel.snappy_message]
+        with self.mock_create_message_fns():
+            message_set = create_message_set(messages, CODEC_SNAPPY)
+        self.assertEqual(message_set, expect)
+
+        # Unknown codec should raise UnsupportedCodecError.
+        with self.assertRaises(UnsupportedCodecError):
+            create_message_set(messages, -1)