BIG5 中文轉 UTF-8 中文資料庫轉換、檔案轉換工具

 
#
# 轉單一個 BIG5 大五碼檔案
#

$ /usr/bin/php big5_to_utf8_conv_xargs.php <ORIGINAL_BIG5_FILES>

#
# 配合 find 一次轉多個 BIG5 大五碼檔案
#
$ find . -name "*.htm" | xargs /usr/bin/php big5_to_utf8_conv_xargs.php


$ find . -name "*.htm.UTF-8" | /usr/bin/rename -f 's/\.UTF-8$//'


$from_char = 'BIG-5';  // 來源編碼
$to_char = 'UTF-8';    // 對象編碼
$tmp_folder = '/tmp/'; // 暫存目錄
$encoding_detect_tool = '/usr/bin/file -b'; 
$big5_exception ==> 因為你一定已經處理過「BIG5::淚許蓋功餐」之類的字，所以轉完之後，你的資料可能會變成「功\夫，淚\眼」這樣帶有反斜線（Back Slash），需要再轉回來一次；由於處理所有字元（$big5_exception_all）太過消耗時間，所以我只挑出常用字
$force ==> 強制轉，不做判斷（因為判斷也可能錯誤）


<?PHP
/*
 * php_big5_to_utf8_conv.php
 *
 * ------
 * Usage:
 * ------
 * # /usr/bin/php php_big5_to_utf8_conv.php <FILENAME/SQL_DUMP> [<FILENAME> ...]
 *
 * Each output file is the input name with "." . $to_char appended
 * e.g.
 *     mysqldb.sql --> mysqldb.sql.UTF-8
 *
 * ------
 * Notes:
 * ------
 * => Mainly converts BIG5 database dumps or text/htm/php files to UTF-8;
 *    in theory other encodings can be converted by changing $from_char or $to_char.
 *
 * => A file is converted when the detection tool's report mentions neither
 *    $to_char nor "ASCII", when it mentions "extended-ASCII", or when
 *    $force is set to '--force'. Otherwise it is skipped.
 *
 * => When $from_char is 'BIG-5', a backslash directly following any character
 *    listed in $big5_exception is removed after conversion (e.g. '功\' -> '功').
 *
 * Rev: 0.1 ... service@vovo2000.com
 */

$from_char = 'BIG-5';
$to_char = 'UTF-8';
// Not needed by encoding detection any more (the tool's output is captured
// directly); kept so existing edits to this setting do not break anything.
$tmp_folder = '/tmp/';
$encoding_detect_tool = '/usr/bin/file -b';

// Set to '--force' to convert every file regardless of the detection result.
// $force = '--force';
$force = '';

// Help text, printed when there is nothing to do or an argument is not a file.
$usage = 'Usage:
        # -------------------------------------------------------------------
        # Recursively convert all *.htm files in this forlder recursively
        # -------------------------------------------------------------------
        #
        # find . -name "*.htm" | xargs /usr/bin/php /tmp/big5_to_utf8_conv_xargs_.php
        #
        #
        # -------------------------------------------------------------------
        # Recursively "Rename" the converted UTF-8 file to overwrite original htm
        # -------------------------------------------------------------------
        #
        # find . -name "*.htm.UTF-8" | rename -f "s/\.UTF-8$//"';

/**
 * Run the external detection tool on a file and return what it printed.
 *
 * @param string $tool     Detection command line, e.g. '/usr/bin/file -b'.
 * @param string $filename File to inspect.
 * @return string Trimmed tool output; '' when the tool printed nothing.
 */
function big5conv_detect_encoding($tool, $filename)
{
    $lines = array();
    // escapeshellarg() keeps names containing spaces or shell metacharacters
    // from being split or interpreted by the shell.
    exec($tool.' '.escapeshellarg($filename), $lines);
    return trim(implode("\n", $lines));
}

/**
 * Split a UTF-8 string into an array of single characters.
 *
 * @param string $chars UTF-8 text.
 * @return array List of characters in order (duplicates kept); empty array
 *               when $chars is empty or not valid UTF-8.
 */
function big5conv_split_chars($chars)
{
    $glyphs = preg_split('//u', $chars, -1, PREG_SPLIT_NO_EMPTY);
    return ($glyphs === false) ? array() : $glyphs;
}

/**
 * Remove one backslash that directly follows each listed character.
 *
 * The list is applied entry by entry, in order, so a character listed twice
 * is processed twice.
 *
 * @param string $text   UTF-8 text to clean up.
 * @param array  $glyphs Characters as returned by big5conv_split_chars().
 * @return string Cleaned text.
 */
function big5conv_strip_workaround_slashes($text, array $glyphs)
{
    $escaped = array();
    foreach ($glyphs as $glyph)
    {
        $escaped[] = $glyph.'\\';
    }
    // Literal replacement: no regex quoting needed and no failure mode on
    // unexpected input.
    return str_replace($escaped, $glyphs, $text);
}

// Without file arguments the loop below never runs; show the help text
// instead of finishing silently.
if (count($argv) < 2)
{
    echo $usage."\n";
}

for ($j = 1; $j < count($argv); $j++)
{
    if (!isset($argv[$j]) || !file_exists($argv[$j]))
    {
        echo $usage;
        exit;
    }

    // Use the argument exactly as it was checked by file_exists().
    $filename = $argv[$j];

    $orig_encoding = big5conv_detect_encoding($encoding_detect_tool, $filename);
    echo "Detect encoding: $filename => ".$orig_encoding."\n";

    // "extended-ASCII" contains "ASCII", so it has to be tested separately
    // to still count as "needs conversion".
    $reported_as_target = strpos($orig_encoding, $to_char) !== FALSE
                       || strpos($orig_encoding, 'ASCII') !== FALSE;
    $is_extended_ascii = strpos($orig_encoding, 'extended-ASCII') !== FALSE;

    if ($reported_as_target && !$is_extended_ascii && $force != '--force')
    {
        echo "Already an $to_char or pure ASCII file, skip convert\n";
        continue;
    }

    $new_filename = $filename.'.'.$to_char;
    echo "Converting $filename --> $new_filename\n";

    $str = file_get_contents($filename);
    if ($str === FALSE)
    {
        echo "Cannot read $filename, skip convert\n";
        continue;
    }

    $new = mb_convert_encoding($str, $to_char, $from_char);

    mb_internal_encoding($to_char);

    if ($from_char == 'BIG-5')
    {
        //
        // Handle 功\ 許\ 淚\ :: i.e. the data was already worked around with
        // added backslashes before it was stored.
        //
        /*
        $big5_exception_all = '么功吒吭沔坼歿俞枯苒娉珮豹崤淚許廄琵跚愧稞鈾暝蓋墦穀閱璞餐縷擺黠孀踊髏躡';
        $big5_exception_all .= '尐佢汻岤垥柦胐娖涂罡偅惝牾莍傜揊焮茻鄃幋滜綅赨塿縷槙擺箤踊嫹髏潿蔌醆嬞獦';
        $big5_exception_all .= '佢螏餤燡螰駹礒鎪瀙酀瀵騱酅贕鱋鱭';
         */
        //
        // Got a lot of data? Handle commonly seen characters only.
        //
        $big5_exception = '么功吒吭歿俞枯苒娉珮豹淚許廄琵跚愧鈾蓋穀閱璞餐縷擺黠孀踊髏涂罡傜縷槙擺踊髏礒';

        //
        // Make all '功\' -> '功' to turn the big5 workaround back to normal
        //
        $big5_chars = big5conv_split_chars($big5_exception);
        $new = big5conv_strip_workaround_slashes($new, $big5_chars);

        // Progress output: every handled character followed by a space.
        foreach ($big5_chars as $big5_char)
        {
            echo $big5_char.' ';
        }
    }

    if (file_put_contents($new_filename, $new) === FALSE)
    {
        echo "\nCannot write $new_filename\n";
        continue;
    }
    echo "\nDone: File Length ".strlen($str).' --> '.strlen($new)."\n";
}
?>


$ STOP your web application (e.g. APACHE2, NGINX, IIS)


$ mysqldump --user=<USERNAME> -p --default-character-set=<ORIG_CHARSET> -c --insert-ignore --skip-extended-insert --skip-set-charset -r zz_mysql_table.SQL <DATABASE_NAME> <DATABASE_TABLE>


$ /usr/bin/php big5_to_utf8_conv_xargs.php zz_mysql_table.SQL


$ perl -pi -w -e 's/CHARSET=<ORIG_CHARSET>/CHARSET=utf8/g;' zz_mysql_table.SQL.UTF-8


$ mysql --user=<USERNAME> --max_allowed_packet=32M -p --default-character-set=utf8 <DATABASE_NAME> < zz_mysql_table.SQL.UTF-8


      ERROR at line 24945: Unknown command '\"'.


      ?,'  ===>  ','

      ?);  ===>  ');

      \'   ===>  \\'


$ iconv -f BIG-5 -t UTF-8 <ORIGINAL_BIG5_FILENAME> -o <TARGET_UTF8_FILENAME>


$ file index.html
file index.html
HTML document, UTF-8 Unicode text, with very long lines, with CRLF line terminators


$ man file

NAME
     file - determine file type

SYNOPSIS
     file [-bchiklLNnprsvz0] [--apple] [--mime-encoding] [--mime-type] [-e testname] [-F separator] [-f namefile]
          [-m magicfiles] file ...
     file -C [-m magicfiles]
     file [--help]


简体中文
郑竹吟
博多ラーメン
小哈工作坊鄭竹吟


&#31616;体中文
&#37073;竹吟
博多&#12521;&#12540;&#12513;&#12531;
小哈工作坊鄭竹吟


/**
 * Decode decimal numeric character references ("&#NNNN;") into UTF-8 text.
 *
 * Only decimal references are touched; named entities (e.g. "&amp;"),
 * hexadecimal references and all other text are returned unchanged.
 * References to code points that HTML5 does not allow are left as written.
 *
 * @param string $number_char_str Text that may contain "&#NNNN;" references.
 * @return string|null The text with references decoded, or NULL if the
 *                     underlying preg_replace_callback() call fails.
 */
function numeric_utf8($number_char_str)
{
    return preg_replace_callback("/(&#[0-9]+;)/",
            function ($ref) {
                // html_entity_decode() is used instead of the mbstring
                // "HTML-ENTITIES" pseudo-encoding, which is deprecated as of
                // PHP 8.2. ENT_QUOTES is required so "&#39;" and "&#34;" are
                // decoded on every PHP version.
                return html_entity_decode($ref[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            },
            $number_char_str);
}

BIG5 轉 UTF8 萬國碼資料庫轉換、檔案轉換工具

big5_to_utf8_conv_xargs.php （同時處理 "淚許蓋功餐" Back Slash）

利用 php_big5_to_utf8_conv 轉換 MYSQL 資料庫

DUMP 回去的過程仍然可能遇到錯誤

附上 ICONV 和 BSD::FILE 使用教學

處理簡體中文、日文、韓文等 "&#數字分號" Unicode 參照碼

延伸閱讀：什麼是 (釵h) (奶O) (傍@) (誘l) (紫])