[已解决] 一个封包算法的解析求助-软件逆向-看雪-安全社区|安全招聘|kanxue.com

最新回复 (9)
萌克力雪币： 433 活跃值： (1895) 能力值： ( LV4，RANK：40 ) 在线值：发帖 35 回帖 607 粉丝 32 关注私信	萌克力 2 楼为何放弃治疗？友情提示:bp send 跟包找上层加密函数自己写解密 2013-7-28 20:33 0
lwykj 雪币： 870 活跃值： (1033) 能力值： ( LV2，RANK：10 ) 在线值：发帖 6 回帖 240 粉丝 0 关注私信	lwykj 3 楼 �'`* �6y�/� �"小"冠"衣"热"公"虹"秋"野"遁"雾"火"猴"肉"帅"呜"桃"帮"侠"生"蜜"敢"鲨"八"莉( 不就是utf8么 2013-7-28 22:53 0
Tebox 雪币： 1088 活跃值： (30) 能力值： ( LV3，RANK：20 ) 在线值：发帖 30 回帖 242 粉丝 0 关注私信	Tebox 4 楼从什么转到UTF8? ansi Unicode转都是乱码 2013-7-29 08:50 0
triones 雪币： 517 活跃值： (84) 能力值： ( LV12，RANK：250 ) 在线值：发帖 17 回帖 158 粉丝 1 关注私信	triones 6 5 楼这编码，明显是UTF8。 win7及以上，kernel自带Unicode转UTF8的API，名字忘记了，自求度娘。 win7以下，系统不自带转换函数，需要自己实现。不过好像其它高级语言有转换的库函数，也自求度娘。给你一个片段： static const int gk_utf8_max_byte = 6; //utf8最大占用字节 #pragma warning(push) #pragma warning(disable:4244) //warning C4244: “=”: 从“const unsigned long”转换到“unsigned char”，可能丢失数据 int unicode_byte2utf8_byte(unsigned char* utf8, const unsigned long unicode) { unsigned char tu[gk_utf8_max_byte]; if(utf8 == NULL) utf8 = (unsigned char*)&tu; //0000 0000 0000 0000 0000 0000 0111 1111 7bit //0000 0000 0000 0000 0000 0000 0XXX XXXX 7bit if(unicode < 0x00000080) { utf8[0] = ((unicode & 0x0000007F) >> 0) \| 0x00; return 1; } //0000 0000 0000 0000 0000 0000 10XX XXXX 6bit const unsigned long a = ((unicode & 0x0000003F) >> 0) \| 0x80; //0000 0000 0000 0000 0000 0111 1111 1111 11bit //0000 0000 0000 0000 0011 0XXX XX00 0000 5bit if(unicode < 0x00000800) { utf8[1] = a; utf8[0] = ((unicode & 0x000007C0) >> 6) \| 0xC0; return 2; } //0000 0000 0000 0000 0010 XXXX XX00 0000 6bit const unsigned long b = ((unicode & 0x00000FC0) >> 6) \| 0x80; //0000 0000 0000 0000 1111 1111 1111 1111 16bit //0000 0000 0000 1110 XXXX 0000 0000 0000 4bit if(unicode < 0x00010000) { utf8[2] = a; utf8[1] = b; utf8[0] = ((unicode & 0x0000F000) >> 12) \| 0xE0; return 3; } //0000 0000 0000 10XX XXXX 0000 0000 0000 6bit const unsigned long c = ((unicode & 0x0003F000) >> 12) \| 0x80; //0000 0000 0001 1111 1111 1111 1111 1111 21bit //0000 0011 110X XX00 0000 0000 0000 0000 3bit if(unicode < 0x00200000) { utf8[3] = a; utf8[2] = b; utf8[1] = c; utf8[0] = ((unicode & 0x001C0000) >> 18) \| 0xF0; return 4; } //0000 0010 XXXX XX00 0000 0000 0000 0000 6bit const unsigned long d = ((unicode & 0x00FC0000) >> 18) \| 0x80; //0000 0011 1111 1111 1111 1111 1111 1111 26bit //1111 10XX 0000 0000 0000 0000 0000 0000 2bit if(unicode < 0x04000000) { utf8[4] = a; utf8[3] = b; utf8[2] = c; utf8[1] = d; utf8[0] = ((unicode & 0x03000000) 
>> 24) \| 0xF8; return 5; } //00XX XXXX 0000 0000 0000 0000 0000 0000 6bit const unsigned long e = ((unicode & 0x3F000000) >> 24) \| 0x80; //0111 1111 1111 1111 1111 1111 1111 1111 31bit //0X00 0000 0000 0000 0000 0000 0000 0000 1bit if(unicode < 0x80000000) { utf8[5] = a; utf8[4] = b; utf8[3] = c; utf8[2] = d; utf8[1] = e; utf8[0] = ((unicode & 0x04000000) >> 30) \| 0xFC; return 6; } return 0; } #pragma warning(pop) 2013-7-29 08:58 0
Tebox 雪币： 1088 活跃值： (30) 能力值： ( LV3，RANK：20 ) 在线值：发帖 30 回帖 242 粉丝 0 关注私信	Tebox 6 楼不对.Unicode转UTF8中文转换后结果是小这个样子的~ 2013-7-29 09:19 0
triones 雪币： 517 活跃值： (84) 能力值： ( LV12，RANK：250 ) 在线值：发帖 17 回帖 158 粉丝 1 关注私信	triones 6 7 楼哥，代码都贴给你了，还不信。不信去问百度http://baike.baidu.com/view/25412.htm 看了utf8编码的定义就可以知道，只要是中文，utf8编码应该都是3 byte，且都以0xE开头。 u5C0F是“小”的Unicode编码，不是UTF8编码 2013-7-29 09:23 0
Tebox 雪币： 1088 活跃值： (30) 能力值： ( LV3，RANK：20 ) 在线值：发帖 30 回帖 242 粉丝 0 关注私信	Tebox 8 楼是的.我看过了. 我想把封包转换成中文.直接套用易语言拿模块测试了下Utf8到Unicode的代码. 可是转换依然未实现中文的效果. 显示全是乱码.. 恕我愚钝..请指教. 2013-7-29 09:43 0
triones 雪币： 517 活跃值： (84) 能力值： ( LV12，RANK：250 ) 在线值：发帖 17 回帖 158 粉丝 1 关注私信	triones 6 9 楼代码给你，有用的自己抠出来。到这地步你还不会，我也没办法了。我看了网上在线转换UTF8的点，都很扯，转换完全不正确。至于易语言，我不知道是不是它的实现不行，还是你的参数不正确。建议去理解一下Unicode编码规范以及UTF-8编码规范。另外，你贴出来的原始封包，并不是全是UTF-8，只有中间一段文字是UTF-8，不能全转换 ws_utf8.h /! \file ws_utf8.h \brief ws_utf8.h用于如UTF8与UNICODE相互转换 \section ws_utf8 ver 1.1.1305.2510 (For All) - \b 2013-03-07 新增unicode与utf8编码的转换。\n ascii与utf8的转换需要自行先从ascii转换为unicode。0.1 - \b 2013-03-08 发现WinXP的ntdll不提供UTF函数，故参考UTF8文档，重新实现。0.1~1.0 - \b 2013-03-09 扩展支持4 byte Unicode - \b 2013-05-25 处理转换时多处理一个数据的BUG。1.0~1.1 \author triones \date 2013-03-07 / #pragma once #include "blks.h" /! 转换一个unicode字符为一个utf8字符 \param utf8 utf8结果缓冲，可为NULL \param unicode unicode字符 \return 返回转换字节数，返回0表示失败 / int unicode_byte2utf8_byte(unsigned char* utf8, const unsigned long unicode); /! 转换一个utf8字符为一个unicode字符\n 注意，自动跳过非法的UTF8字符、不完整的UTF8字符 \param unicode 结果缓冲，可为NULL \param utf8 utf8字符指针 \return 返回读取utf8字节数，返回0表示失败 / int utf8_byte2unicode_byte(unsigned long* unicode, const unsigned char* const utf8); //! UNICODE串转换UTF8串(版本一) /! \param utf8 指向转换UTF8结果的缓冲区 \param max_utf8 指示转换UTF8结果的缓冲区的最大容量(以byte计) \param ws 需要转换的UNICODE串 \param ws_len 需要转换的UNICODE串的长度(以宽字计)\n ws_len缺省为-1时，视ws为null结束的串\n 注意：需要用户自行提供足够转换缓冲。另请注意参数顺序 \return 转换成功与否 \code #include "ws_s.h" char str[40]; if(!ws2utf8(str,sizeof(str),L"文字")) { cout<<"ws2utf8转换出错，LastError："<<GetLastError(); } \endcode / bool ws2utf8(unsigned char* uft8, const int max_uft8, const wchar_t* ws, const int ws_len = -1); //! UNICODE串转换UTF8串(版本二) /! \param ws 需要转换的UNICODE串 \return 转换后的对应UTF8串对象 \code #include "ws_s.h" blks<char> s = ws2utf8(L"文字"); if(s.empty()) { cout<<"ws2utf8转换出错，LastError："<<GetLastError(); } \endcode / blks<unsigned char> ws2utf8(const wchar_t* ws); //! UTF8串转换UNICODE串(版本一) /! 
\param ws 指向转换UNICODE结果的缓冲区 \param max_ws 指示转换UNICODE结果的缓冲区的最大容量(以宽字计) \param utf8 需要转换的UTF8串 \param utf8_len 需要转换的UTF8串的长度(以宽字计)\n utf8_len缺省为-1时，视s为null结束的串 \return 转换成功与否\n 注意：需要用户自行提供足够转换缓冲。另请注意参数顺序 / bool utf82ws(wchar_t* ws, const int max_ws, const unsigned char* utf8, const int utf8_len = -1); //! UTF8串转换UNICODE串(版本二) /! \param utf8 需要转换的UTF8串 \return 转换后的对应UNICODE串对象 / blks<wchar_t> utf82ws(const unsigned char* utf8); ------------------------------------------------ ws_utf8.cpp #include "ws_utf8.h" static const int gk_utf8_max_byte = 6; //utf8最大占用字节 #pragma warning(push) #pragma warning(disable:4244) //warning C4244: “=”: 从“const unsigned long”转换到“unsigned char”，可能丢失数据 int unicode_byte2utf8_byte(unsigned char* utf8, const unsigned long unicode) { unsigned char tu[gk_utf8_max_byte]; if(utf8 == NULL) utf8 = (unsigned char)&tu; //0000 0000 0000 0000 0000 0000 0111 1111 7bit //0000 0000 0000 0000 0000 0000 0XXX XXXX 7bit if(unicode < 0x00000080) { utf8[0] = ((unicode & 0x0000007F) >> 0) \| 0x00; return 1; } //0000 0000 0000 0000 0000 0000 10XX XXXX 6bit const unsigned long a = ((unicode & 0x0000003F) >> 0) \| 0x80; //0000 0000 0000 0000 0000 0111 1111 1111 11bit //0000 0000 0000 0000 0011 0XXX XX00 0000 5bit if(unicode < 0x00000800) { utf8[1] = a; utf8[0] = ((unicode & 0x000007C0) >> 6) \| 0xC0; return 2; } //0000 0000 0000 0000 0010 XXXX XX00 0000 6bit const unsigned long b = ((unicode & 0x00000FC0) >> 6) \| 0x80; //0000 0000 0000 0000 1111 1111 1111 1111 16bit //0000 0000 0000 1110 XXXX 0000 0000 0000 4bit if(unicode < 0x00010000) { utf8[2] = a; utf8[1] = b; utf8[0] = ((unicode & 0x0000F000) >> 12) \| 0xE0; return 3; } //0000 0000 0000 10XX XXXX 0000 0000 0000 6bit const unsigned long c = ((unicode & 0x0003F000) >> 12) \| 0x80; //0000 0000 0001 1111 1111 1111 1111 1111 21bit //0000 0011 110X XX00 0000 0000 0000 0000 3bit if(unicode < 0x00200000) { utf8[3] = a; utf8[2] = b; utf8[1] = c; utf8[0] = ((unicode & 0x001C0000) >> 18) \| 0xF0; return 4; } //0000 0010 
XXXX XX00 0000 0000 0000 0000 6bit const unsigned long d = ((unicode & 0x00FC0000) >> 18) \| 0x80; //0000 0011 1111 1111 1111 1111 1111 1111 26bit //1111 10XX 0000 0000 0000 0000 0000 0000 2bit if(unicode < 0x04000000) { utf8[4] = a; utf8[3] = b; utf8[2] = c; utf8[1] = d; utf8[0] = ((unicode & 0x03000000) >> 24) \| 0xF8; return 5; } //00XX XXXX 0000 0000 0000 0000 0000 0000 6bit const unsigned long e = ((unicode & 0x3F000000) >> 24) \| 0x80; //0111 1111 1111 1111 1111 1111 1111 1111 31bit //0X00 0000 0000 0000 0000 0000 0000 0000 1bit if(unicode < 0x80000000) { utf8[5] = a; utf8[4] = b; utf8[3] = c; utf8[2] = d; utf8[1] = e; utf8[0] = ((unicode & 0x04000000) >> 30) \| 0xFC; return 6; } return 0; } #pragma warning(pop) int utf8_byte2unicode_byte(unsigned long unicode, const unsigned char* const utf8) { if(utf8 == NULL) return 0; unsigned long tu; if(unicode == NULL) unicode = &tu; const unsigned char utf8_flag[gk_utf8_max_byte] = {0x7F,0xC0,0xE0,0xF0,0xF8,0xFC}; bool done = false; int lp = 0; while(!done) { if(utf8[lp] <= utf8_flag[0]) { unicode = utf8[lp]; return 1; } if(utf8[lp] < utf8_flag[1]) //首字节非法，跳过 { ++lp; continue; } for(int i = 2; i < gk_utf8_max_byte; ++i) { if((utf8[lp] < utf8_flag[i])) { unsigned long u = utf8[lp] ^ utf8_flag[i-1]; ++lp; int j = 1; for(; j < i; ++j) { ++lp; if(utf8[lp-1] >= utf8_flag[1]) break; //后继字节非法，跳过 u <<= 6; u \|= (utf8[lp-1] & 0x3F); } if(j == i) { unicode = u; return lp; } done = true; //读取UTF字符不完整，需要重来 } if(done) { done = false; break; } } } return 0; } bool ws2utf8(unsigned char* uft8, const int max_uft8, const wchar_t * ws, const int ws_len) { if((uft8 == NULL) \|\| (max_uft8 <= 1) \|\| (ws == NULL) \|\| (ws_len == 0)) return false; int wlen = ws_len; if(ws_len < 0) { for(wlen = 0; ws[wlen] != TEXT('\0'); ++wlen); } ++wlen; int lp = 0; for(int i = 0; i < wlen ; ++i) { int k = unicode_byte2utf8_byte(&uft8[lp],ws[i]); if(k == 0) return false; lp += k; if(lp >= max_uft8) return false; } return true; } blks<unsigned char> 
ws2utf8(const wchar_t* ws) { blks<unsigned char> utf8; if(ws == NULL) return utf8; int wlen = 0; for(; ws[wlen] != TEXT('\0'); ++wlen); ++wlen; unsigned char u[gk_utf8_max_byte]; for(int i = 0; i < wlen ; ++i) { int k = unicode_byte2utf8_byte(u,ws[i]); if(k == 0) { utf8.clear(); break; } utf8.put(u,k); } return utf8; } bool utf82ws(wchar_t* ws, const int max_ws, const unsigned char* utf8, const int utf8_len) { if((ws == NULL) \|\| (max_ws <= 1) \|\| (utf8 == NULL) \|\| (utf8_len == 0)) return false; int ulen = utf8_len; if(utf8_len < 0) { for(ulen = 0; utf8[ulen] != '\0'; ++ulen); } ++ulen; int lp = 0; for(int i = 0; i < ulen ;) { unsigned long ch; int k = utf8_byte2unicode_byte(&ch,&utf8[i]); if(k == 0) return false; ws[lp] = (wchar_t)ch; if(lp >= max_ws) return false; ++lp; i += k; } return true; } blks<wchar_t> utf82ws(const unsigned char* utf8) { blks<wchar_t> ws; if(utf8 == NULL) return ws; int ulen = 0; for(; utf8[ulen] != '\0'; ++ulen); ++ulen; for(int i = 0; i < ulen ;) { unsigned long ch; int k = utf8_byte2unicode_byte(&ch,&utf8[i]); if(k == 0) { ws.clear(); return ws; } ws << (wchar_t)ch; i += k; } return ws; } 2013-7-29 10:09 0
Tebox 雪币： 1088 活跃值： (30) 能力值： ( LV3，RANK：20 ) 在线值：发帖 30 回帖 242 粉丝 0 关注私信	Tebox 10 楼非常感谢.我在尝试解决 2013-7-29 10:14 0
	游客登录 \| 注册方可回帖回帖表情雪币赚取及消费高级回复

萌克力

雪币： 433

活跃值： (1895)

能力值：

( LV4，RANK：40 )

在线值：

发帖

35

回帖

607

粉丝

32

关注

私信

萌克力: 2 楼

为何放弃治疗？
友情提示:bp send 跟包找上层加密函数自己写解密

2013-7-28 20:33

0

lwykj

雪币： 870

活跃值： (1033)

能力值：

( LV2，RANK：10 )

在线值：

发帖

6

回帖

240

粉丝

0

关注

私信

lwykj: 3 楼

�'`* �6y�/� �"小"冠"衣"热"公"虹"秋"野"遁"雾"火"猴"肉"帅"呜"桃"帮"侠"生"蜜"敢"鲨"八"莉(

不就是utf8么

2013-7-28 22:53

0

Tebox

雪币： 1088

活跃值： (30)

能力值：

( LV3，RANK：20 )

在线值：

发帖

30

回帖

242

粉丝

0

关注

私信

Tebox: 4 楼

从什么转到UTF8? ansi Unicode转都是乱码

2013-7-29 08:50

0

triones

雪币： 517

活跃值： (84)

能力值：

( LV12，RANK：250 )

在线值：

发帖

17

回帖

158

粉丝

1

关注

私信

triones 6: 5 楼

这编码，明显是UTF8。
win7及以上，kernel自带Unicode转UTF8的API，名字忘记了，自求度娘。
win7以下，系统不自带转换函数，需要自己实现。不过好像其它高级语言有转换的库函数，也自求度娘。

给你一个片段：

static const int gk_utf8_max_byte = 6;   // largest number of bytes one encoded character can occupy

#pragma warning(push)
#pragma warning(disable:4244) // C4244: conversion from 'const unsigned long' to 'unsigned char', possible loss of data
/*!
  Encodes one code point using the original (up to 6 byte, 31 bit) UTF-8 scheme.
  \param utf8     Receives the encoded bytes; must have room for gk_utf8_max_byte
                  bytes. May be NULL, in which case the bytes are discarded and
                  only the length is reported.
  \param unicode  Code point to encode (0 .. 0x7FFFFFFF).
  \return         Number of bytes produced (1..6), or 0 when the value does not
                  fit in 31 bits.
  \note           No validation beyond the range check is performed: surrogate
                  values and values above U+10FFFF are encoded as given.
*/
int unicode_byte2utf8_byte(unsigned char*       utf8,
                           const unsigned long  unicode)
  {
  unsigned char tu[gk_utf8_max_byte];
  if(utf8 == NULL) utf8 = tu;           // scratch buffer so the caller can ask for the length only
  // 1 byte:  0xxxxxxx                  (7 payload bits)
  if(unicode < 0x00000080)
    {
    utf8[0] = static_cast<unsigned char>(unicode);
    return 1;
    }
  // Trailing bytes all have the form 10xxxxxx; a..e hold payload bits
  // 0-5, 6-11, 12-17, 18-23 and 24-29 respectively.
  const unsigned char a = static_cast<unsigned char>(((unicode >>  0) & 0x3F) | 0x80);
  // 2 bytes: 110xxxxx 10xxxxxx         (11 payload bits)
  if(unicode < 0x00000800)
    {
    utf8[1] = a;
    utf8[0] = static_cast<unsigned char>(((unicode >> 6) & 0x1F) | 0xC0);
    return 2;
    }
  const unsigned char b = static_cast<unsigned char>(((unicode >>  6) & 0x3F) | 0x80);
  // 3 bytes: 1110xxxx + 2 trailing     (16 payload bits)
  if(unicode < 0x00010000)
    {
    utf8[2] = a;  utf8[1] = b;
    utf8[0] = static_cast<unsigned char>(((unicode >> 12) & 0x0F) | 0xE0);
    return 3;
    }
  const unsigned char c = static_cast<unsigned char>(((unicode >> 12) & 0x3F) | 0x80);
  // 4 bytes: 11110xxx + 3 trailing     (21 payload bits)
  if(unicode < 0x00200000)
    {
    utf8[3] = a;  utf8[2] = b;  utf8[1] = c;
    utf8[0] = static_cast<unsigned char>(((unicode >> 18) & 0x07) | 0xF0);
    return 4;
    }
  const unsigned char d = static_cast<unsigned char>(((unicode >> 18) & 0x3F) | 0x80);
  // 5 bytes: 111110xx + 4 trailing     (26 payload bits)
  if(unicode < 0x04000000)
    {
    utf8[4] = a;  utf8[3] = b;  utf8[2] = c;  utf8[1] = d;
    utf8[0] = static_cast<unsigned char>(((unicode >> 24) & 0x03) | 0xF8);
    return 5;
    }
  const unsigned char e = static_cast<unsigned char>(((unicode >> 24) & 0x3F) | 0x80);
  // 6 bytes: 1111110x + 5 trailing     (31 payload bits)
  if(unicode < 0x80000000)
    {
    utf8[5] = a;  utf8[4] = b;  utf8[3] = c;  utf8[2] = d;  utf8[1] = e;
    // The lead byte carries bit 30. The previous mask (0x04000000) selected
    // bit 26, which the shift by 30 then discarded, so the bit was always lost.
    utf8[0] = static_cast<unsigned char>(((unicode >> 30) & 0x01) | 0xFC);
    return 6;
    }
  return 0;   // more than 31 bits: not representable
  }
#pragma warning(pop)

2013-7-29 08:58

0

Tebox

雪币： 1088

活跃值： (30)

能力值：

( LV3，RANK：20 )

在线值：

发帖

30

回帖

242

粉丝

0

关注

私信

Tebox: 6 楼

不对.Unicode转UTF8中文转换后结果是小这个样子的~

2013-7-29 09:19

0

triones

雪币： 517

活跃值： (84)

能力值：

( LV12，RANK：250 )

在线值：

发帖

17

回帖

158

粉丝

1

关注

私信

triones 6: 7 楼

哥，代码都贴给你了，还不信。不信去问百度http://baike.baidu.com/view/25412.htm
看了utf8编码的定义就可以知道，只要是中文，utf8编码应该都是3 byte，且都以0xE开头。
\u5C0F（即 U+5C0F）是“小”的Unicode编码，不是UTF8编码；“小”的UTF8编码是 E5 B0 8F 三个字节。

2013-7-29 09:23

0

Tebox

雪币： 1088

活跃值： (30)

能力值：

( LV3，RANK：20 )

在线值：

发帖

30

回帖

242

粉丝

0

关注

私信

Tebox: 8 楼

是的.我看过了.
我想把封包转换成中文.直接套用易语言拿模块测试了下Utf8到Unicode的代码.
可是转换依然未实现中文的效果. 显示全是乱码..

恕我愚钝..请指教.

2013-7-29 09:43

0

triones

雪币： 517

活跃值： (84)

能力值：

( LV12，RANK：250 )

在线值：

发帖

17

回帖

158

粉丝

1

关注

私信

triones 6: 9 楼

代码给你，有用的自己抠出来。到这地步你还不会，我也没办法了。
我看了网上在线转换UTF8的点，都很扯，转换完全不正确。
至于易语言，我不知道是不是它的实现不行，还是你的参数不正确。

建议去理解一下Unicode编码规范以及UTF-8编码规范。

另外，你贴出来的原始封包，并不是全是UTF-8，只有中间一段文字是UTF-8，不能全转换

ws_utf8.h

/*!
  \file  ws_utf8.h
  \brief ws_utf8.h用于如UTF8与UNICODE相互转换

  \section ws_utf8 ver 1.1.1305.2510 (For All)

  - \b 2013-03-07 新增unicode与utf8编码的转换。\n
    ascii与utf8的转换需要自行先从ascii转换为unicode。0.1
  - \b 2013-03-08 发现WinXP的ntdll不提供UTF函数，故参考UTF8文档，重新实现。0.1~1.0
  - \b 2013-03-09 扩展支持4 byte Unicode
  - \b 2013-05-25 处理转换时多处理一个数据的BUG。1.0~1.1

  \author     triones
  \date       2013-03-07
*/

#pragma once

#include "blks.h"

/*!
  Converts a single Unicode character into a single UTF-8 character.
  \param utf8     Buffer receiving the UTF-8 result; may be NULL
  \param unicode  The Unicode character to convert
  \return         Number of bytes produced by the conversion; 0 means failure
*/
int unicode_byte2utf8_byte(unsigned char*       utf8,
                           const unsigned long  unicode);

/*!
  Converts a single UTF-8 character into a single Unicode character.\n
  Note: illegal UTF-8 bytes and incomplete UTF-8 characters are skipped
  automatically.
  \param unicode  Buffer receiving the result; may be NULL
  \param utf8     Pointer to the UTF-8 character to read
  \return         Number of UTF-8 bytes read; 0 means failure
*/
int utf8_byte2unicode_byte(unsigned long*             unicode,
                           const unsigned char* const utf8);

//! Converts a UNICODE (wide) string to a UTF-8 string (variant 1: caller-supplied buffer)
/*!
  \param uft8      Buffer receiving the UTF-8 result (parameter name spelled
                   "uft8" in the declaration)
  \param max_uft8  Capacity of the result buffer, counted in bytes
  \param ws        The UNICODE string to convert
  \param ws_len    Length of the UNICODE string, counted in wide characters.\n
                   When left at the default of -1, ws is treated as a
                   null-terminated string.\n
                   Note: the caller must supply a buffer large enough for the
                   conversion. Also mind the parameter order.
  \return          Whether the conversion succeeded
  \code
    #include "ws_utf8.h"
    unsigned char str[40];
    if(!ws2utf8(str,sizeof(str),L"文字"))
      {
      // NOTE(review): the original example printed GetLastError() here; confirm
      // that the implementation actually sets a last-error code before relying on it.
      cout<<"ws2utf8 conversion failed, LastError: "<<GetLastError();
      }
  \endcode
*/
bool ws2utf8(unsigned char* uft8,
             const int      max_uft8,
             const wchar_t* ws,
             const int      ws_len = -1);

//! Converts a UNICODE (wide) string to a UTF-8 string (variant 2: returns a blks object)
/*!
  \param ws The null-terminated UNICODE string to convert
  \return   A blks object holding the corresponding UTF-8 string
  \code
    #include "ws_utf8.h"
    blks<unsigned char> s = ws2utf8(L"文字");
    if(s.empty())
      {
      // NOTE(review): the original example printed GetLastError() here; confirm
      // that the implementation actually sets a last-error code before relying on it.
      cout<<"ws2utf8 conversion failed, LastError: "<<GetLastError();
      }
  \endcode
*/
blks<unsigned char> ws2utf8(const wchar_t* ws);

//! Converts a UTF-8 string to a UNICODE (wide) string (variant 1: caller-supplied buffer)
/*!
  \param ws        Buffer receiving the UNICODE result
  \param max_ws    Capacity of the result buffer, counted in wide characters
  \param utf8      The UTF-8 string to convert
  \param utf8_len  Length of the UTF-8 string, counted in bytes (it is used as
                   an index into the utf8 byte array).\n
                   When left at the default of -1, utf8 is treated as a
                   null-terminated string.
  \return          Whether the conversion succeeded\n
                   Note: the caller must supply a buffer large enough for the
                   conversion. Also mind the parameter order.
*/
bool utf82ws(wchar_t*             ws,
             const int            max_ws,
             const unsigned char* utf8,
             const int            utf8_len = -1);

//! Converts a UTF-8 string to a UNICODE (wide) string (variant 2: returns a blks object)
/*!
  \param utf8   The null-terminated UTF-8 string to convert
  \return       A blks object holding the corresponding UNICODE string
*/
blks<wchar_t> utf82ws(const unsigned char* utf8);

------------------------------------------------
ws_utf8.cpp

#include "ws_utf8.h"

static const int gk_utf8_max_byte = 6;   // largest number of bytes one encoded character can occupy

#pragma warning(push)
#pragma warning(disable:4244) // C4244: conversion from 'const unsigned long' to 'unsigned char', possible loss of data
/*!
  Encodes one code point using the original (up to 6 byte, 31 bit) UTF-8 scheme.
  \param utf8     Receives the encoded bytes; must have room for gk_utf8_max_byte
                  bytes. May be NULL, in which case the bytes are discarded and
                  only the length is reported.
  \param unicode  Code point to encode (0 .. 0x7FFFFFFF).
  \return         Number of bytes produced (1..6), or 0 when the value does not
                  fit in 31 bits.
  \note           No validation beyond the range check is performed: surrogate
                  values and values above U+10FFFF are encoded as given.
*/
int unicode_byte2utf8_byte(unsigned char*       utf8,
                           const unsigned long  unicode)
  {
  unsigned char tu[gk_utf8_max_byte];
  if(utf8 == NULL) utf8 = tu;           // scratch buffer so the caller can ask for the length only
  // 1 byte:  0xxxxxxx                  (7 payload bits)
  if(unicode < 0x00000080)
    {
    utf8[0] = static_cast<unsigned char>(unicode);
    return 1;
    }
  // Trailing bytes all have the form 10xxxxxx; a..e hold payload bits
  // 0-5, 6-11, 12-17, 18-23 and 24-29 respectively.
  const unsigned char a = static_cast<unsigned char>(((unicode >>  0) & 0x3F) | 0x80);
  // 2 bytes: 110xxxxx 10xxxxxx         (11 payload bits)
  if(unicode < 0x00000800)
    {
    utf8[1] = a;
    utf8[0] = static_cast<unsigned char>(((unicode >> 6) & 0x1F) | 0xC0);
    return 2;
    }
  const unsigned char b = static_cast<unsigned char>(((unicode >>  6) & 0x3F) | 0x80);
  // 3 bytes: 1110xxxx + 2 trailing     (16 payload bits)
  if(unicode < 0x00010000)
    {
    utf8[2] = a;  utf8[1] = b;
    utf8[0] = static_cast<unsigned char>(((unicode >> 12) & 0x0F) | 0xE0);
    return 3;
    }
  const unsigned char c = static_cast<unsigned char>(((unicode >> 12) & 0x3F) | 0x80);
  // 4 bytes: 11110xxx + 3 trailing     (21 payload bits)
  if(unicode < 0x00200000)
    {
    utf8[3] = a;  utf8[2] = b;  utf8[1] = c;
    utf8[0] = static_cast<unsigned char>(((unicode >> 18) & 0x07) | 0xF0);
    return 4;
    }
  const unsigned char d = static_cast<unsigned char>(((unicode >> 18) & 0x3F) | 0x80);
  // 5 bytes: 111110xx + 4 trailing     (26 payload bits)
  if(unicode < 0x04000000)
    {
    utf8[4] = a;  utf8[3] = b;  utf8[2] = c;  utf8[1] = d;
    utf8[0] = static_cast<unsigned char>(((unicode >> 24) & 0x03) | 0xF8);
    return 5;
    }
  const unsigned char e = static_cast<unsigned char>(((unicode >> 24) & 0x3F) | 0x80);
  // 6 bytes: 1111110x + 5 trailing     (31 payload bits)
  if(unicode < 0x80000000)
    {
    utf8[5] = a;  utf8[4] = b;  utf8[3] = c;  utf8[2] = d;  utf8[1] = e;
    // The lead byte carries bit 30. The previous mask (0x04000000) selected
    // bit 26, which the shift by 30 then discarded, so the bit was always lost.
    utf8[0] = static_cast<unsigned char>(((unicode >> 30) & 0x01) | 0xFC);
    return 6;
    }
  return 0;   // more than 31 bits: not representable
  }
#pragma warning(pop)

/*!
  Decodes the next character from a UTF-8 byte stream (original 1..6 byte scheme).

  Bytes that cannot start a character (stray continuation bytes, 0xFE, 0xFF)
  and lead bytes whose sequence is cut short are skipped; decoding resumes at
  the first byte that did not fit, so that byte is never swallowed.

  \param unicode  Receives the decoded code point; may be NULL to only measure.
  \param utf8     Start of the byte stream. The function has no length
                  parameter: it stops only once a character has been decoded,
                  so the stream must contain a complete character (a NUL
                  terminator qualifies) at or after this position.
  \return         Number of bytes consumed, including any skipped bytes, so the
                  caller can advance by exactly this amount. 0 only when utf8
                  is NULL.
  \note           Overlong encodings and surrogate values are not rejected.
*/
int utf8_byte2unicode_byte(unsigned long*             unicode,
                           const unsigned char* const utf8)
  {
  if(utf8 == NULL)  return 0;
  unsigned long tu;
  if(unicode == NULL) unicode = &tu;
  int lp = 0;
  for(;;)
    {
    const unsigned char lead = utf8[lp];
    if(lead < 0x80)                       // 0xxxxxxx: plain ASCII (includes NUL)
      {
      *unicode = lead;
      return lp + 1;                      // count the bytes skipped before it as well
      }
    // Classify the lead byte: total sequence length and its payload bits.
    int total = 0;                        // 0 = this byte cannot start a character
    unsigned long u = 0;
    if(lead < 0xC0)       { total = 0; }                  // 10xxxxxx: stray continuation
    else if(lead < 0xE0)  { total = 2; u = lead & 0x1F; } // 110xxxxx
    else if(lead < 0xF0)  { total = 3; u = lead & 0x0F; } // 1110xxxx
    else if(lead < 0xF8)  { total = 4; u = lead & 0x07; } // 11110xxx
    else if(lead < 0xFC)  { total = 5; u = lead & 0x03; } // 111110xx
    else if(lead < 0xFE)  { total = 6; u = lead & 0x01; } // 1111110x
    ++lp;                                 // the lead (or illegal) byte is consumed either way
    if(total == 0)  continue;             // illegal first byte: skip it
    int j = 1;
    for(; j < total; ++j)
      {
      const unsigned char next = utf8[lp];
      // Only 10xxxxxx is a continuation. Anything else (ASCII, NUL, another
      // lead byte) ends the sequence early and is left for the next pass.
      if((next & 0xC0) != 0x80)  break;
      u = (u << 6) | (next & 0x3F);
      ++lp;
      }
    if(j == total)
      {
      *unicode = u;
      return lp;
      }
    // Incomplete sequence: drop what was read and retry from the current byte.
    }
  }

bool ws2utf8(unsigned char*   uft8,
             const int        max_uft8,
             const wchar_t *  ws,
             const int        ws_len)
  {
  if((uft8 == NULL) || (max_uft8 <= 1) || (ws == NULL) || (ws_len == 0))  return false;
  int wlen = ws_len;
  if(ws_len < 0)
    {
    for(wlen = 0; ws[wlen] != TEXT('\0'); ++wlen);
    }
  ++wlen;
  int lp = 0;
  for(int i = 0; i < wlen ; ++i)
    {
    int k = unicode_byte2utf8_byte(&uft8[lp],ws[i]);
    if(k == 0)  return false;
    lp += k;
    if(lp >= max_uft8)  return false;
    }
  return true;
  }

/*!
  Converts a NUL-terminated wide string into a blks byte container holding the
  UTF-8 encoding, terminator included.
  \param ws  Source wide string; NULL yields an empty container.
  \return    The encoded bytes, or a cleared container if any character could
             not be encoded.
*/
blks<unsigned char> ws2utf8(const wchar_t* ws)
  {
  blks<unsigned char> result;
  if(ws != NULL)
    {
    unsigned char encoded[gk_utf8_max_byte];
    const wchar_t* cursor = ws;
    bool at_terminator = false;
    // Walk the string once; the terminator itself is encoded as the last step.
    do
      {
      at_terminator = (*cursor == L'\0');
      const int produced = unicode_byte2utf8_byte(encoded, *cursor);
      if(produced == 0)
        {
        result.clear();   // unencodable value: report failure as an empty result
        break;
        }
      result.put(encoded, produced);
      ++cursor;
      }
    while(!at_terminator);
    }
  return result;
  }

/*!
  Converts a UTF-8 string to a NUL-terminated wide string in a caller buffer.

  \param ws        Destination buffer.
  \param max_ws    Capacity of the destination in wide characters; at least 2.
  \param utf8      Source UTF-8 bytes.
  \param utf8_len  Number of bytes to convert, or a negative value to convert
                   up to the NUL terminator of utf8. Decoding starts only at
                   offsets below this length; a terminator is always appended.
  \return          true when every character and the terminator fit; false on
                   invalid arguments, a decoder failure, or lack of space (the
                   buffer may then hold a partial, unterminated result).
  \note            Each code point is narrowed to one wchar_t.
                   NOTE(review): where wchar_t is 16 bits, code points above
                   0xFFFF will presumably be truncated - confirm.
  \note            The per-character decoder takes no length, so a multi-byte
                   sequence beginning before utf8_len may read bytes beyond it.
*/
bool utf82ws(wchar_t*             ws,
             const int            max_ws,
             const unsigned char* utf8,
             const int            utf8_len)
  {
  if((ws == NULL) || (max_ws <= 1) || (utf8 == NULL) || (utf8_len == 0))  return false;
  int ulen = utf8_len;
  if(utf8_len < 0)
    {
    for(ulen = 0; utf8[ulen] != '\0'; ++ulen);
    }
  int lp = 0;
  for(int i = 0; i < ulen ;)
    {
    unsigned long ch;
    const int k = utf8_byte2unicode_byte(&ch,&utf8[i]);
    if(k == 0)  return false;
    // Check before storing, and keep the last slot free for the terminator.
    if(lp >= max_ws - 1)  return false;
    ws[lp] = static_cast<wchar_t>(ch);
    ++lp;
    i += k;
    }
  // Terminate explicitly instead of decoding utf8[ulen], which need not be a
  // NUL (or even readable) when the caller passed an explicit length.
  ws[lp] = L'\0';
  return true;
  }

/*!
  Converts a NUL-terminated UTF-8 string into a blks container of wide
  characters, terminator included.
  \param utf8  Source UTF-8 bytes; NULL yields an empty container.
  \return      The decoded characters, or a cleared container if the decoder
               reported failure.
*/
blks<wchar_t> utf82ws(const unsigned char* utf8)
  {
  blks<wchar_t> wide;
  if(utf8 == NULL)  return wide;
  // Byte count up to and including the terminator.
  int total = 1;
  while(utf8[total - 1] != '\0')  ++total;
  int pos = 0;
  while(pos < total)
    {
    unsigned long code;
    const int used = utf8_byte2unicode_byte(&code, utf8 + pos);
    if(used == 0)
      {
      wide.clear();   // decoder failure: hand back an empty result
      return wide;
      }
    wide << (wchar_t)code;
    pos += used;
    }
  return wide;
  }

2013-7-29 10:09

0

Tebox

雪币： 1088

活跃值： (30)

能力值：

( LV3，RANK：20 )

在线值：

发帖

30

回帖

242

粉丝

0

关注

私信

Tebox: 10 楼

非常感谢.我在尝试解决

2013-7-29 10:14

0