|
以下代码是 MS 记事本中使用的字符编码识别代码,
可以有效地识别 UTF-8 或 Unicode(UTF-16)文本。
第一步是判断文件开头的 BOM(文件头);如果没有 BOM,则根据字节的取值范围判断编码。
- /* IsTextUTF8
- *
- * UTF-8 is the encoding of Unicode based on Internet Society RFC2279
- *
- * Basicly:
- * 0000 0000-0000 007F - 0xxxxxxx (ascii converts to 1 octet!)
- * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx ( 2 octet format)
- * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
- * (this keeps going for 32 bit unicode)
- *
- *
- * Return value: TRUE, if the text is in UTF-8 format.
- * FALSE, if the text is not in UTF-8 format.
- * We will also return FALSE is it is only 7-bit ascii, so the right code page
- * will be used.
- *
- * Actually for 7 bit ascii, it doesn't matter which code page we use, but
- * notepad will remember that it is utf-8 and "save" or "save as" will store
- * the file with a UTF-8 BOM. Not cool.
- */
- INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
- {
- INT i;
- DWORD cOctets; // octets to go in this UTF-8 encoded character
- UCHAR chr;
- BOOL bAllAscii= TRUE;
- cOctets= 0;
- for( i=0; i < iLen; i++ ) {
- chr= *(lpstrInputStream+i);
- if( (chr&0x80) != 0 ) bAllAscii= FALSE;
- if( cOctets == 0 ) {
- //
- // 7 bit ascii after 7 bit ascii is just fine. Handle start of encoding case.
- //
- if( chr >= 0x80 ) {
- //
- // count of the leading 1 bits is the number of characters encoded
- //
- do {
- chr <<= 1;
- cOctets++;
- }
- while( (chr&0x80) != 0 );
- cOctets--; // count includes this character
- if( cOctets == 0 ) return FALSE; // must start with 11xxxxxx
- }
- }
- else {
- // non-leading bytes must start as 10xxxxxx
- if( (chr&0xC0) != 0x80 ) {
- return FALSE;
- }
- cOctets--; // processed another octet in encoding
- }
- }
- //
- // End of text. Check for consistency.
- //
- if( cOctets > 0 ) { // anything left over at the end is an error
- return FALSE;
- }
- if( bAllAscii ) { // Not utf-8 if all ascii. Forces caller to use code pages for conversion
- return FALSE;
- }
- return TRUE;
- }
- /* IsInputTextUnicode
- * Verify if the input stream is in Unicode format.
- *
- * Return value: TRUE, if the text is in Unicode format.
- *
- * 29 June 1998
- */
- INT IsInputTextUnicode (LPSTR lpstrInputStream, INT iLen)
- {
- INT iResult= ~0; // turn on IS_TEXT_UNICODE_DBCS_LEADBYTE
- BOOL bUnicode;
- // We would like to check the possibility
- // of IS_TEXT_UNICODE_DBCS_LEADBYTE.
- //
- bUnicode= IsTextUnicode( lpstrInputStream, iLen, &iResult);
- if (bUnicode &&
- ((iResult & IS_TEXT_UNICODE_STATISTICS) != 0 ) &&
- ((iResult & (~IS_TEXT_UNICODE_STATISTICS)) == 0 ) )
- {
- CPINFO cpiInfo;
- CHAR* pch= (CHAR*)lpstrInputStream;
- INT cb;
- //
- // If the result depends only upon statistics, check
- // to see if there is a possibility of DBCS.
- // Only do this check if the ansi code page is DBCS
- //
- GetCPInfo( CP_ACP, &cpiInfo);
- if( cpiInfo.MaxCharSize > 1 )
- {
- for( cb=0; cb<iLen; cb++ )
- {
- if( IsDBCSLeadByte(*pch++) )
- {
- return FALSE;
- }
- }
- }
- }
- return bUnicode;
- }
- #define UNICODE_FFFF 0xFFFF // not referenced in the visible excerpt
- #define REVERSE_BYTE_ORDER_MARK 0xFFFE // first-element value the switch below maps to FT_UNICODEBE
- #define BYTE_ORDER_MARK 0xFEFF // first-element value the switch below maps to FT_UNICODE
- lpBuf= MapViewOfFile( hMap, FILE_MAP_READ, 0,0,len); // Excerpt: map the file read-only, then choose ftOpenedAs from its BOM or, failing that, its content. NOTE(review): no NULL check on the view is visible here -- confirm the omitted code has one.
- lpBufAfterBOM= (LPSTR) lpBuf; // start of the text; moved past the BOM only in the UTF-8 BOM case below
- if( typeFlag == FT_UNKNOWN ) // detection runs only when typeFlag is FT_UNKNOWN
- {
- switch(*lpBuf) // first element of the buffer; lpBuf's declaration is not shown -- presumably a 16-bit character pointer, TODO confirm
- {
- case BYTE_ORDER_MARK:
- bUnicode= TRUE;
- ftOpenedAs= FT_UNICODE;
- // Character count excludes the BOM itself.
- nChars= len / sizeof(TCHAR) -1;
- break;
- case REVERSE_BYTE_ORDER_MARK:
- bUnicode= TRUE;
- ftOpenedAs= FT_UNICODEBE;
- // Character count excludes the BOM itself.
- nChars= len / sizeof(TCHAR) -1;
- break;
- // The UTF-8 BOM has 3 bytes; if the third byte does not match, fall through to the BOM-less detection in default.
- case BOM_UTF8_HALF: // defined outside this excerpt; presumably the first two BOM bytes read as one element -- TODO confirm
- if (len > 2 && ((BYTE) *(((LPSTR)lpBuf)+2) == BOM_UTF8_2HALF) ) // needs more than 2 bytes and a third byte equal to BOM_UTF8_2HALF
- {
- bUTF8= TRUE;
- cpTemp= CP_UTF8;
- ftOpenedAs= FT_UTF8; // nChars is not assigned on this path in the visible code
- // Skip the three BOM bytes: advance the text pointer and shrink len to match.
- lpBufAfterBOM= (LPSTR)lpBuf + 3;
- len -= 3;
- break;
- } // no UTF-8 BOM: intentional fall-through into default
- default:
- // No recognized BOM: is the file Unicode without a BOM?
- if ((bUnicode= IsInputTextUnicode((LPSTR) lpBuf, len))) // assignment inside the condition is intentional
- {
- ftOpenedAs= FT_UNICODE;
- nChars= len / sizeof(TCHAR);
- }
- else
- {
- // Is the file UTF-8 even though it doesn't have a UTF-8 BOM?
- if ((bUTF8= IsTextUTF8((LPSTR) lpBuf, len))) // assignment inside the condition is intentional
- {
- ftOpenedAs= FT_UTF8;
- cpTemp= CP_UTF8;
- }
- // Neither Unicode nor UTF-8: treat it as an ANSI file.
- else
- {
- ftOpenedAs= FT_ANSI;
- cpTemp= CP_ACP;
- }
- }
- break;
- }
- }
复制代码 |
|