protocol buffer编码规则
更新日期:
这段时间想把protocol buffer用在移动平台上,这时就发现它还是有点庞大的,不易编译或者移植,所以想着了解下它的编解码方式,看看是否能实现一个简单的编码库。看了源码之后就发现,其实虽然protocol buffer支持的数据类型有很多,但是底层的编码方式也只有6种,编解码部分还是很明了的。
protocol buffer对每个field编码时,先编码一个uint32类型的tag(tag=(fieldnumber<<3)|WireType),接着再编码此field的内容,如此循环往复,直至把所有field编码完。tag是按照uint32类型编码的,field内容按照其类型分别编码,编码方式如下:
uint32
对应于protocol buffer中类型uint32,sint32和bool,编码方式如下
1
2
3
4
5
6
7
8
9
10 INL static uint8* WriteVarint32ToArray(uint32 value, uint8* target)
{
while (value >= 0x80) {
*target = static_cast<uint8>(value | 0x80);
value >>= 7;
++target;
}
*target = static_cast<uint8>(value);
return target + 1;
}
特别的,对于sint32先进行(static_cast<uint32>(n) << 1) ^ static_cast<uint32>(n >> 31)的ZigZag转换,然后再按uint32编码。
uint64
对应于protocol buffer中类型int64,uint64,sint64,代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64 INL static uint8* WriteVarint64ToArrayInline(uint64 value, uint8* target)
{
// Splitting into 32-bit pieces gives better performance on 32-bit
// processors.
uint32 part0 = static_cast<uint32>(value );
uint32 part1 = static_cast<uint32>(value >> 28);
uint32 part2 = static_cast<uint32>(value >> 56);
int size;
if (part2 == 0) {
if (part1 == 0) {
if (part0 < (1 << 14)) {
if (part0 < (1 << 7)) {
size = 1; goto size1;
} else {
size = 2; goto size2;
}
} else {
if (part0 < (1 << 21)) {
size = 3; goto size3;
} else {
size = 4; goto size4;
}
}
} else {
if (part1 < (1 << 14)) {
if (part1 < (1 << 7)) {
size = 5; goto size5;
} else {
size = 6; goto size6;
}
} else {
if (part1 < (1 << 21)) {
size = 7; goto size7;
} else {
size = 8; goto size8;
}
}
}
} else {
if (part2 < (1 << 7)) {
size = 9; goto size9;
} else {
size = 10; goto size10;
}
}
//GOOGLE_LOG(FATAL) << "Can't get here.";
size10: target[9] = static_cast<uint8>((part2 >> 7) | 0x80);
size9 : target[8] = static_cast<uint8>((part2 ) | 0x80);
size8 : target[7] = static_cast<uint8>((part1 >> 21) | 0x80);
size7 : target[6] = static_cast<uint8>((part1 >> 14) | 0x80);
size6 : target[5] = static_cast<uint8>((part1 >> 7) | 0x80);
size5 : target[4] = static_cast<uint8>((part1 ) | 0x80);
size4 : target[3] = static_cast<uint8>((part0 >> 21) | 0x80);
size3 : target[2] = static_cast<uint8>((part0 >> 14) | 0x80);
size2 : target[1] = static_cast<uint8>((part0 >> 7) | 0x80);
size1 : target[0] = static_cast<uint8>((part0 ) | 0x80);
target[size-1] &= 0x7F;
return target + size;
}
int32
对应于protocol buffer中类型int32,代码如下
1
2
3
4
5
6
7
8 INL static uint8* WriteVarint32SignExtendedToArray(int32 value, uint8* target)
{
if (value < 0) {
return WriteVarint64ToArrayInline(static_cast<uint64>(value), target);
} else {
return WriteVarint32ToArray(static_cast<uint32>(value), target);
}
}
float
对应于protocol buffer中类型Fixed32,SFixed32,Float,代码如下
1
2
3
4
5
6
7
8
9
10
11
12 INL static uint8* WriteLittleEndian32ToArray(uint32 value, uint8* target)
{
#if defined(PROTOBUF_LITTLE_ENDIAN)
memcpy(target, &value, sizeof(value));
#else
target[0] = static_cast<uint8>(value);
target[1] = static_cast<uint8>(value >> 8);
target[2] = static_cast<uint8>(value >> 16);
target[3] = static_cast<uint8>(value >> 24);
#endif
return target + sizeof(value);
}
特别的对于float,经过union {float f; uint32 i;};f = value;return i;转换后编码。
double
对应于protocol buffer中类型Fixed64,SFixed64,Double,代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 INL static uint8* WriteLittleEndian64ToArray(uint64 value, uint8* target)
{
#if defined(PROTOBUF_LITTLE_ENDIAN)
memcpy(target, &value, sizeof(value));
#else
uint32 part0 = static_cast<uint32>(value);
uint32 part1 = static_cast<uint32>(value >> 32);
target[0] = static_cast<uint8>(part0);
target[1] = static_cast<uint8>(part0 >> 8);
target[2] = static_cast<uint8>(part0 >> 16);
target[3] = static_cast<uint8>(part0 >> 24);
target[4] = static_cast<uint8>(part1);
target[5] = static_cast<uint8>(part1 >> 8);
target[6] = static_cast<uint8>(part1 >> 16);
target[7] = static_cast<uint8>(part1 >> 24);
#endif
return target + sizeof(value);
}
特别的对于double,经过union {double f; uint64 i;};f = value;return i;转换后编码。
string
对应于protocol buffer中类型string,message,代码如下
1
2
3
4
5 INL static uint8* WriteRawToArray(const void* buffer, int size, uint8* target)
{
memcpy(target, buffer, size);
return target + size;
}
对于这种复杂结构protocol buffer先写入tag,然后写入内容长度,最后写入编码内容。
特别的对于message类型,先把其编码成string,然后再按string类型编码进其宿主message。
repeated(simple)
对应于protocol buffer中所有非string,message类型的repeated,编码如下:
1
2
3
4
5 WireFormatLite::WriteTagToArray(field_number_arg, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, target);
io::CodedOutputStream::WriteVarint32ToArray(_lens_cached_byte_size_, target);
for(int i = 0; i < value.count; ++i){
WriteNoTagDataToArray(value[i], target);
}
其中_lens_cached_byte_size_表示repeated中所有内容编码后的长度。
repeated(string,message)
对应于protocol buffer中string,message类型的repeated,编码如下:
1
2
3
4
5 for(int i = 0; i < value.count; ++i){
target = WireFormatLite::WriteTagToArray(field_number_arg, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, target);
target = io::CodedOutputStream::WriteVarint32ToArray(value[i]._lens_cached_byte_size_, target);
target = io::CodedOutputStream::WriteRawToArray(value[i].buf, value[i]._lens_cached_byte_size_, target);
}
这是把repeated当成单独的多个独立成员编码的。
_lens_cached_byte_size_表示string的长度或者message编码后的长度,buf即string的内容或者message编码后的内容。
map
对应于protocol buffer中map类型
之前博文中已经讲过protocol buffer中的map实际上是一个repeated message,此message包含两个成员:fieldnumber=1的key和fieldnumber=2的value,所以其编码就是按照repeated(string,message)的编码方式编码的。