For the IDA decompilation, database decryption, and JSON export steps, see: QQDecrypt
This post covers how to turn the exported JSON into human-readable text. Given the project's needs and my limited time, I only looked into converting plain-text messages; XML-type messages were partially explored and are not covered here.
The exported data looks like this:
[
{
"40001": 7355779890493674288,
"40002": 0,
"40003": 180810,
"40005": 0,
"40006": 0,
"40010": 2,
"40011": 5,
"40012": 4,
"40013": 0,
"40020": "u_8xkqJFcVLW6q2fNXIP8Yyw",
"40021": "577507458",
"40026": 675,
"40027": 577507458,
"40030": 0,
"40033": 0,
"40040": 0,
"40041": 2,
"40050": 1712116828,
"40052": 0,
"40058": 1712073600,
"40060": 0,
"40062": null,
"40083": 0,
"40084": 0,
"40090": "",
"40093": "",
"40100": 0,
"40105": 0,
"40600": "wukTCJjOFNf107AG",
"40601": null,
"40605": null,
"40800": "gvYTb8j8FbGm2dbn2r6KZtD8FQjY/BUBsKUXALqlFxh1Xzh4a3FKRmNWTFc2cTJmTlhJUDhZeXfCpRcYdV84eGtxSkZjVkxXNnEyZk5YSVA4WXl3iqYXG+S9oOeMnOeMnOaSpOWbnuS6huS7gOS5iOOAgg==",
"40801": null,
"40850": 0,
"40851": 0,
"40900": null
},
...
]
Field 40800 holds the chat content; records whose value has the look of base64-encoded data are plain-text messages.
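For reference, here is a minimal sketch of pulling that field out of the export; the file name messages.json is just a placeholder for wherever the JSON dump was saved:

import json

# "messages.json" is a placeholder name for the exported dump shown above
with open("messages.json", encoding="utf-8") as f:
    records = json.load(f)

# Collect the base64 strings stored in the chat content field
contents = [r["40800"] for r in records if r.get("40800")]
print(f"{len(contents)} text-bearing records")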
The first thought, naturally, is to base64-decode the field value:
‚öTÈü³¦ÙÖçÚ¾ŠfÐüØü°¥�º¥u_5bO_Lm-rqqd0_4DyuQJGuQÂ¥u_5bO_Lm-rqqd0_4DyuQJGuQŠ¦�
The result is clearly not what we want; it looks like some binary encoding.
QQDecrypt does tell us these are protobuf-format messages, but I don't know the .proto schema QQ uses, so I had to keep digging.
Here is a script to inspect the content and dump it as hex:
import base64
import binascii

def analyze_binary(base64_content):
    # Base64-decode the field value
    decoded = base64.b64decode(base64_content)
    print(f"Total length: {len(decoded)} bytes")
    # Dump the full content as hex
    hex_str = binascii.hexlify(decoded).decode('ascii')
    bytes_list = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)]
    print("\nFull hex content:")
    for i in range(0, len(bytes_list), 16):
        chunk = bytes_list[i:i+16]
        # Offset
        print(f"{i:04x}: ", end='')
        # Hex bytes
        print(" ".join(chunk), end='')
        # Pad the last row
        print(" " * (16 - len(chunk)), end='')
        # ASCII rendering (printable bytes only)
        ascii_repr = "".join(chr(int(x, 16)) if 32 <= int(x, 16) <= 126 else '.'
                             for x in chunk)
        print(f" |{ascii_repr}|")
    # Header structure
    print("\nHeader analysis:")
    print(f"First 3 bytes: {' '.join(bytes_list[:3])}")
    # Everything after the header
    print("\nContent after header:")
    content_bytes = bytes_list[3:]
    print(" ".join(content_bytes))
    # Try a GBK decode of the remaining content
    try:
        content_hex = "".join(content_bytes)
        content_binary = bytes.fromhex(content_hex)
        print("\nTrying GBK decode on content:")
        print(content_binary.decode('gbk', errors='ignore'))
    except Exception as e:
        print(f"\nGBK decode failed: {str(e)}")

# Analyze a short message
print("Analyzing short message:")
short_message = "gvYTHsj8Fb/L5KTDuu//ZtD8FQHqghYG5a6M5ZWm8IIWAA=="
analyze_binary(short_message)
Result:
Total length: 34 bytes
Full hex content:
0000: 82 f6 13 1e c8 fc 15 bf cb e4 a4 c3 ba ef ff 66 |...............f|
0010: d0 fc 15 01 ea 82 16 06 e5 ae 8c e5 95 a6 f0 82 |................|
0020: 16 00 |..|
Header analysis:
First 3 bytes: 82 f6 13
Content after header:
1e c8 fc 15 bf cb e4 a4 c3 ba ef ff 66 d0 fc 15 01 ea 82 16 06 e5 ae 8c e5 95 a6 f0 82 16 00
Trying GBK decode on content:
赛克浃煤f悬陚瀹屽暒饌
The GBK decode is not valid content, but looking at the bytes it is easy to spot e5 ae 8c e5 95 a6 near the end, which is the UTF-8 encoding of the Chinese text 完啦.
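As a quick sanity check, decoding those six bytes by hand gives exactly that string:

print(bytes.fromhex("e5 ae 8c e5 95 a6").decode("utf-8"))  # -> 完啦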
I then tested several more messages, and they all showed the following characteristics:
- Every message has the same header marker 82 f6 13 and the same trailer marker f0 82 16 00
- Everything from a certain byte after the header marker up to the trailer marker decodes as valid UTF-8
So I wrote the following script to test this:
import base64

def decode_text_message(base64_content):
    try:
        # 1. Base64 decode
        data = base64.b64decode(base64_content)
        # 2. Locate the trailer marker
        end_marker = b'\xf0\x82\x16\x00'
        text_end = data.rfind(end_marker)
        if text_end == -1:
            return None
        # 3. Look for the UTF-8 text after the fixed header
        start_pos = data.rfind(b'\xea\x82\x16')  # marker right before the text
        if start_pos != -1:
            text_bytes = data[start_pos+3:text_end]
            text = text_bytes.decode('utf-8')
            return text
        return None
    except Exception as e:
        print(f"Decode error: {e}")
        return None

# Test
test_messages = [
    "gvYTJMj8FdvB9eeFw5eqZtD8FQHqghYM5aW955So54ix55So8IIWAA==",  # "好用爱用"
    "gvYTLcj8FYPP8MPGvJeqZtD8FQHqghYV5LiN6LazNS415bCx6IO95YiG5byA8IIWAA==",  # "不足5.5就能分开"
    "gvYTG8j8FY3Uz472uJeqZtD8FQHqghYD5a+58IIWAA==",  # "对"
    "gvYTO8j8Faqv6fn0p5eqZtD8FQHqghYj5pio5aSp5p+g5qqs5rC0NOWdlyDku4rlpKnlj6/kuZAxLjXwghYA"  # "昨天柠檬水4块 今天可乐1.5"
]
for i, msg in enumerate(test_messages, 1):
    text = decode_text_message(msg)
    print(f"Message {i}: {text}")
Result:
Message 1: 好用爱用
Message 2: 不足5.5就能分开
Message 3: 对
Message 4: #昨天柠檬水4块 今天可乐1.5
The messages are indeed extracted, but each one carries a stray character at the front. After experimenting with the offset, it turns out skipping just one more byte is enough:
import base64

def decode_text_message(base64_content):
    try:
        # 1. Base64 decode
        data = base64.b64decode(base64_content)
        # 2. Locate the trailer marker
        end_marker = b'\xf0\x82\x16\x00'
        text_end = data.rfind(end_marker)
        if text_end == -1:
            return None
        # 3. Look for the UTF-8 text after the fixed header
        start_pos = data.rfind(b'\xea\x82\x16')  # marker right before the text
        if start_pos != -1:
            # Skip the 3-byte marker plus one extra byte
            text_bytes = data[start_pos+4:text_end]
            text = text_bytes.decode('utf-8')
            return text
        return None
    except Exception as e:
        print(f"Decode error: {e}")
        return None

test_messages = [
    "gvYTJMj8FdvB9eeFw5eqZtD8FQHqghYM5aW955So54ix55So8IIWAA==",  # "好用爱用"
    "gvYTLcj8FYPP8MPGvJeqZtD8FQHqghYV5LiN6LazNS415bCx6IO95YiG5byA8IIWAA==",  # "不足5.5就能分开"
    "gvYTG8j8FY3Uz472uJeqZtD8FQHqghYD5a+58IIWAA==",  # "对"
    "gvYTO8j8Faqv6fn0p5eqZtD8FQHqghYj5pio5aSp5p+g5qqs5rC0NOWdlyDku4rlpKnlj6/kuZAxLjXwghYA"  # "昨天柠檬水4块 今天可乐1.5"
]
for i, msg in enumerate(test_messages, 1):
    text = decode_text_message(msg)
    print(f"Message {i}: {text}")
With that change, all of the text is displayed correctly.
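In hindsight, the extra byte lines up with the protobuf hint from QQDecrypt. Reading the blob as raw protobuf wire format (the field numbers below are just my own reading of the bytes, not a known schema): 82 f6 13 is the tag of a length-delimited field 40800 wrapping a nested message, ea 82 16 is the tag of a length-delimited field 45101 holding the text, so the skipped byte is simply that string's length prefix, and f0 82 16 00 is a trailing varint field 45102 with value 0. A minimal tag dump to illustrate:

import base64

def read_varint(buf, pos):
    # Read a protobuf base-128 varint; return (value, next position)
    value, shift = 0, 0
    while True:
        byte = buf[pos]
        value |= (byte & 0x7F) << shift
        pos += 1
        if not byte & 0x80:
            return value, pos
        shift += 7

def dump_fields(buf, indent=""):
    # Print field number, wire type and value for each field in buf
    pos = 0
    while pos < len(buf):
        key, pos = read_varint(buf, pos)
        field, wire = key >> 3, key & 7
        if wire == 0:                       # varint
            value, pos = read_varint(buf, pos)
        elif wire == 2:                     # length-delimited (string / nested message)
            length, pos = read_varint(buf, pos)
            value = buf[pos:pos + length]
            pos += length
        else:                               # other wire types don't occur in these samples
            print(f"{indent}field {field}: unsupported wire type {wire}")
            return
        print(f"{indent}field {field} (wire type {wire}): {value!r}")

blob = base64.b64decode("gvYTHsj8Fb/L5KTDuu//ZtD8FQHqghYG5a6M5ZWm8IIWAA==")
dump_fields(blob)                           # outer layer: field 40800 wraps a nested message
_, pos = read_varint(blob, 0)               # skip the outer tag...
length, pos = read_varint(blob, pos)        # ...and its length byte
dump_fields(blob[pos:pos + length], "  ")   # inner fields; 45101 holds the UTF-8 text

Seen this way, the marker-based extraction above is just a shortcut for reading field 45101 without a .proto definition.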
Appendix: hex dump example of an XML message
Total length: 379 bytes
Full hex content:
0000: 82 f6 13 f2 06 d0 c4 13 02 c8 fc 15 9a dc e8 d1 |................|
0010: a1 f1 84 80 67 d0 fc 15 10 ca dd 17 40 72 79 6b |....g.......@ryk|
0020: 69 38 43 49 58 53 71 2b 53 45 2b 7a 4a 54 47 44 |i8CIXSq+SE+zJTGD|
0030: 53 38 32 68 72 45 4d 2b 44 43 74 68 43 4a 30 39 |S82hrEM+DCthCJ09|
0040: 48 43 58 30 44 6a 55 6e 59 39 30 66 33 69 30 46 |HCX0DjUnY90f3i0F|
0050: 34 51 53 4b 6f 30 69 59 2f 6f 69 6d 58 d2 dd 17 |4QSKo0iY/oimX...|
0060: e0 05 3c 3f 78 6d 6c 20 76 65 72 73 69 6f 6e 3d |..<?xml version=|
0070: 27 31 2e 30 27 20 65 6e 63 6f 64 69 6e 67 3d 27 |'1.0' encoding='|
0080: 55 54 46 2d 38 27 20 73 74 61 6e 64 61 6c 6f 6e |UTF-8' standalon|
0090: 65 3d 22 79 65 73 22 3f 3e 20 3c 6d 73 67 20 73 |e="yes"?> <msg s|
00a0: 65 72 76 69 63 65 49 44 3d 22 33 35 22 20 74 65 |erviceID="35" te|
00b0: 6d 70 6c 61 74 65 49 44 3d 22 31 22 20 61 63 74 |mplateID="1" act|
00c0: 69 6f 6e 3d 22 76 69 65 77 4d 75 6c 74 69 4d 73 |ion="viewMultiMs|
00d0: 67 22 20 62 72 69 65 66 3d 22 5b e8 81 8a e5 a4 |g" brief="[.....|
00e0: a9 e8 ae b0 e5 bd 95 5d 22 20 6d 5f 66 69 6c 65 |.......]" m_file|
00f0: 4e 61 6d 65 3d 22 34 30 35 36 32 63 66 38 2d 65 |Name="40562cf8-e|
0100: 36 61 61 2d 34 64 37 33 2d 38 30 63 62 2d 66 37 |6aa-4d73-80cb-f7|
0110: 32 34 33 64 61 66 38 62 36 33 22 20 6d 5f 72 65 |243daf8b63" m_re|
0120: 73 69 64 3d 22 72 79 6b 69 38 43 49 58 53 71 2b |sid="ryki8CIXSq+|
0130: 53 45 2b 7a 4a 54 47 44 53 38 32 68 72 45 4d 2b |SE+zJTGDS82hrEM+|
0140: 44 43 74 68 43 4a 30 39 48 43 58 30 44 6a 55 6e |DCthCJ09HCX0DjUn|
0150: 59 39 30 66 33 69 30 46 34 51 53 4b 6f 30 69 59 |Y90f3i0F4QSKo0iY|
0160: 2f 6f 69 6d 58 22 20 74 53 75 6d 3d 22 32 37 38 |/oimX" tSum="278|
0170: 22 20 66 6c 61 67 3d 22 33 22 3e |" flag="3">|
Header analysis:
First 3 bytes: 82 f6 13
Content after header:
f2 06 d0 c4 13 02 c8 fc 15 9a dc e8 d1 a1 f1 84 80 67 d0 fc 15 10 ca dd 17 40 72 79 6b 69 38 43 49 58 53 71 2b 53 45 2b 7a 4a 54 47 44 53 38 32 68 72 45 4d 2b 44 43 74 68 43 4a 30 39 48 43 58 30 44 6a 55 6e 59 39 30 66 33 69 30 46 34 51 53 4b 6f 30 69 59 2f 6f 69 6d 58 d2 dd 17 e0 05 3c 3f 78 6d 6c 20 76 65 72 73 69 6f 6e 3d 27 31 2e 30 27 20 65 6e 63 6f 64 69 6e 67 3d 27 55 54 46 2d 38 27 20 73 74 61 6e 64 61 6c 6f 6e 65 3d 22 79 65 73 22 3f 3e 20 3c 6d 73 67 20 73 65 72 76 69 63 65 49 44 3d 22 33 35 22 20 74 65 6d 70 6c 61 74 65 49 44 3d 22 31 22 20 61 63 74 69 6f 6e 3d 22 76 69 65 77 4d 75 6c 74 69 4d 73 67 22 20 62 72 69 65 66 3d 22 5b e8 81 8a e5 a4 a9 e8 ae b0 e5 bd 95 5d 22 20 6d 5f 66 69 6c 65 4e 61 6d 65 3d 22 34 30 35 36 32 63 66 38 2d 65 36 61 61 2d 34 64 37 33 2d 38 30 63 62 2d 66 37 32 34 33 64 61 66 38 62 36 33 22 20 6d 5f 72 65 73 69 64 3d 22 72 79 6b 69 38 43 49 58 53 71 2b 53 45 2b 7a 4a 54 47 44 53 38 32 68 72 45 4d 2b 44 43 74 68 43 4a 30 39 48 43 58 30 44 6a 55 6e 59 39 30 66 33 69 30 46 34 51 53 4b 6f 30 69 59 2f 6f 69 6d 58 22 20 74 53 75 6d 3d 22 32 37 38 22 20 66 6c 61 67 3d 22 33 22 3e
Trying GBK decode on content:
心赛氒柩●剙g悬瘦@ryki8CIXSq+SE+zJTGDS82hrEM+DCthCJ09HCX0DjUnY90f3i0F4QSKo0iY/oimX逸<?xml version='1.0' encoding='UTF-8' standalone="yes"?> <msg serviceID="35" templateID="1" action="viewMultiMsg" brief="[鑱婂ぉ璁板綍]" m_fileName="40562cf8-e6aa-4d73-80cb-f7243daf8b63" m_resid="ryki8CIXSq+SE+zJTGDS82hrEM+DCthCJ09HCX0DjUnY90f3i0F4QSKo0iY/oimX" tSum="278" flag="3">
It is easy to see that this is standard XML markup; simply extract the latter portion and parse it as usual.
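A minimal sketch of that extraction, assuming the XML sits in one contiguous UTF-8 run inside the blob (true for the samples I checked; the cut points may need adjusting if other fields follow the markup):

import base64

def extract_xml(base64_content):
    # Pull the embedded XML document out of an XML-type message blob
    data = base64.b64decode(base64_content)
    start = data.find(b"<?xml")
    if start == -1:
        return None                        # not an XML-type message
    end = data.rfind(b">")                 # last closing bracket of the markup
    if end < start:
        return None
    return data[start:end + 1].decode("utf-8", errors="ignore")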
Special thanks to @Faspand for providing the exported chat records.