protocol buffer编码规则

文章目录

1. uint32
2. uint64
3. int32
4. float
5. double
6. string
7. repeated(simple)
8. repeated(string,message)
9. map

这段时间想把protocol buffer用在移动平台上，这时就发现它还是有点庞大的，不易编译或者移植，所以想着了解下它的编解码方式，看看是否能实现一个简单的编码库。看了源码之后就发现，其实虽然protocol buffer支持的数据类型有很多，但是底层的编码方式也只有6种，编解码部分还是很明了的。
protocol buffer对每个field编码时，先编码一个uint32类型的tag(tag=fieldnumber<<3|WireType)，接着再编码此field的内容，如此循环往复，直至把所有field编码完。tag是按照uint32类型编码的，field内容按照其类型分别编码，编码方式如下：

uint32

对应于protocol buffer中类型uint32,sint32和bool，编码方式如下

// Encodes `value` as a base-128 varint starting at `target`: seven payload
// bits per byte, least-significant group first, with bit 0x80 set on every
// byte except the last one.
// Returns the position just past the last byte written. The caller must
// provide enough room (a 32-bit input needs at most five bytes).
INL static uint8* WriteVarint32ToArray(uint32 value, uint8* target)
{
	uint8* out = target;
	for (; value >= 0x80; value >>= 7) {
		// More groups follow: emit the low bits with the continuation flag.
		*out++ = static_cast<uint8>(value | 0x80);
	}
	// Last group: value is below 0x80 here, so the continuation flag is clear.
	*out++ = static_cast<uint8>(value);
	return out;
}

特别的，对于sint32先进行(static_cast<uint32>(n) << 1) ^ (n >> 31)转换（即ZigZag编码）再编码，对于bool进行value ? 1 : 0转换后再编码。

uint64

对应于protocol buffer中类型int64,uint64,sint64,代码如下

	// Encodes `value` as a base-128 varint starting at `target` (1 to 10
	// bytes) and returns the position just past the last byte written.
	// The caller must provide room for the encoded bytes.
	INL static uint8* WriteVarint64ToArrayInline(uint64 value, uint8* target)
	{
		// Work on three 32-bit windows of the input (starting at bits 0, 28
		// and 56) so that all later shifts are 32-bit operations, which is
		// cheaper on 32-bit processors. Each window feeds up to four 7-bit
		// groups.
		const uint32 low  = static_cast<uint32>(value      );
		const uint32 mid  = static_cast<uint32>(value >> 28);
		const uint32 high = static_cast<uint32>(value >> 56);

		// Determine how many bytes the encoding occupies.
		int size;
		if (high != 0) {
			size = (high < (1 << 7)) ? 9 : 10;
		} else if (mid != 0) {
			if (mid < (1 << 7)) {
				size = 5;
			} else if (mid < (1 << 14)) {
				size = 6;
			} else if (mid < (1 << 21)) {
				size = 7;
			} else {
				size = 8;
			}
		} else {
			if (low < (1 << 7)) {
				size = 1;
			} else if (low < (1 << 14)) {
				size = 2;
			} else if (low < (1 << 21)) {
				size = 3;
			} else {
				size = 4;
			}
		}

		// Fill the bytes from the highest used index down to index 0. Every
		// case deliberately falls through to the next one, and every byte is
		// written with the continuation flag set.
		switch (size) {
			case 10: target[9] = static_cast<uint8>((high >>  7) | 0x80); // fall through
			case 9 : target[8] = static_cast<uint8>((high      ) | 0x80); // fall through
			case 8 : target[7] = static_cast<uint8>((mid  >> 21) | 0x80); // fall through
			case 7 : target[6] = static_cast<uint8>((mid  >> 14) | 0x80); // fall through
			case 6 : target[5] = static_cast<uint8>((mid  >>  7) | 0x80); // fall through
			case 5 : target[4] = static_cast<uint8>((mid       ) | 0x80); // fall through
			case 4 : target[3] = static_cast<uint8>((low  >> 21) | 0x80); // fall through
			case 3 : target[2] = static_cast<uint8>((low  >> 14) | 0x80); // fall through
			case 2 : target[1] = static_cast<uint8>((low  >>  7) | 0x80); // fall through
			case 1 : target[0] = static_cast<uint8>((low       ) | 0x80);
		}

		// The final byte terminates the varint, so clear its continuation flag.
		target[size - 1] &= 0x7F;
		return target + size;
	}

int32

对应于protocol buffer中类型int32,代码如下

// Encodes a signed 32-bit value as a varint and returns the position just
// past the last byte written.
// Non-negative values use the 32-bit encoder. Negative values are converted
// to uint64 first (which sign-extends them) and go through the 64-bit encoder.
INL static uint8* WriteVarint32SignExtendedToArray(int32 value, uint8* target)
{
	if (value >= 0) {
		return WriteVarint32ToArray(static_cast<uint32>(value), target);
	}
	return WriteVarint64ToArrayInline(static_cast<uint64>(value), target);
}

float

对应于protocol buffer中类型Fixed32,SFixed32,Float,代码如下

	// Stores `value` at `target` in little-endian byte order and returns the
	// position just past the stored bytes.
	INL static uint8* WriteLittleEndian32ToArray(uint32 value, uint8* target)
	{
#if !defined(PROTOBUF_LITTLE_ENDIAN)
		// Portable path: emit the four bytes explicitly, least significant first.
		int idx = 0;
		for (int shift = 0; shift < 32; shift += 8) {
			target[idx++] = static_cast<uint8>(value >> shift);
		}
#else
		// The build declares a little-endian host, so the in-memory bytes can
		// be copied as they are.
		memcpy(target, &value, sizeof(value));
#endif
		return target + sizeof(value);
	}

特别的对于float，经过union {float f; uint32 i;};f = value;return i;转换后编码。

double

对应于protocol buffer中类型Fixed64,SFixed64,Double,代码如下

	// Stores `value` at `target` in little-endian byte order and returns the
	// position just past the stored bytes.
	INL static uint8* WriteLittleEndian64ToArray(uint64 value, uint8* target)
	{
#if !defined(PROTOBUF_LITTLE_ENDIAN)
		// Portable path: split into two 32-bit halves (low half first) so the
		// byte extraction only needs 32-bit shifts, then emit each half least
		// significant byte first.
		const uint32 halves[2] = {
			static_cast<uint32>(value),
			static_cast<uint32>(value >> 32)
		};

		int idx = 0;
		for (int half = 0; half < 2; ++half) {
			for (int shift = 0; shift < 32; shift += 8) {
				target[idx++] = static_cast<uint8>(halves[half] >> shift);
			}
		}
#else
		// The build declares a little-endian host, so the in-memory bytes can
		// be copied as they are.
		memcpy(target, &value, sizeof(value));
#endif
		return target + sizeof(value);
	}

特别的对于double，经过union {double f; uint64 i;};f = value;return i;转换后编码。

string

对应于protocol buffer中类型string,message,代码如下

// Copies `size` bytes from `buffer` to `target` verbatim and returns the
// position just past the copied bytes. No length prefix or tag is written
// here; the caller is responsible for those.
INL static uint8* WriteRawToArray(const void* buffer, int size, uint8* target)
{
	uint8* const past_end = target + size;
	memcpy(target, buffer, size);
	return past_end;
}

对于这种复杂结构protocol buffer先写入tag，然后写入内容长度，最后写入编码内容。
特别的对于message类型，先把其编码成string，然后再按string类型编码进其宿主message。

repeated(simple)

对应于protocol buffer中所有非string,message类型的repeated,编码如下:

WireFormatLite::WriteTagToArray(field_number_arg, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, target);
io::CodedOutputStream::WriteVarint32ToArray(_lens_cached_byte_size_, target);
for(int i = 0; i < value.count; ++i){
			WriteNoTagDataToArray(value[i], target);
			}

其中_lens_cached_byte_size_表示repeated中所有内容编码后的长度。

repeated(string,message)

对应于protocol buffer中string,message类型的repeated,编码如下:

for(int i = 0; i < value.count; ++i){
                                target = WireFormatLite::WriteTagToArray(field_number_arg, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, target);
			target = io::CodedOutputStream::WriteVarint32ToArray(value[i]._lens_cached_byte_size_, target);
			target = io::CodedOutputStream::WriteRawToArray(value[i].buf, value[i]._lens_cached_byte_size_, target);
			}

这是把repeated当成单独的多个独立成员编码的。
_lens_cached_byte_size_表示string的长度或者message编码后的长度，buf即string的内容或者message编码后的内容。

map

对应于protocol buffer中map类型
之前博文中已经讲过protocol buffer中的map实际上是一个repeated message，此message包含两个成员：fieldnumber=1的key和fieldnumber=2的value，所以其编码就是按照repeated(string,message)的编码方式编码的。