将 SQL Server 中的文本值从 UTF8 转换为 ISO 8859-1



我在SQL Server中有一列,带有utf8 SQL_Latin1_General_CP1_CI_AS编码。如何转换和保存ISO 8859-1编码的文本?我想在SQL Server上的查询中做一些事情。有什么提示吗?

奥尔·Gostei do jogo.Quando "baixei" até achei que não iria curtir muito

我编写了一个函数来修复存储在varchar字段中的 UTF-8 文本。

要检查固定值,您可以像这样使用它:

CREATE TABLE #Table1 (Column1 varchar(max))
INSERT #Table1
VALUES ('Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito')
SELECT *, NewColumn1 = dbo.DecodeUTF8String(Column1)
FROM Table1
WHERE Column1 <> dbo.DecodeUTF8String(Column1)

输出:

Column1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito
NewColumn1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito

代码:

CREATE FUNCTION dbo.DecodeUTF8String (@value varchar(max))
RETURNS nvarchar(max)
AS
BEGIN
    -- Transforms a UTF-8 encoded varchar string into Unicode
    -- By Anthony Faull 2014-07-31
    DECLARE @result nvarchar(max);
    -- If ASCII or null there's no work to do
    IF (@value IS NULL
        OR @value NOT LIKE '%[^ -~]%' COLLATE Latin1_General_BIN
    )
        RETURN @value;
    -- Generate all integers from 1 to the length of string
    WITH e0(n) AS (SELECT TOP(POWER(2,POWER(2,0))) NULL FROM (VALUES (NULL),(NULL)) e(n))
        , e1(n) AS (SELECT TOP(POWER(2,POWER(2,1))) NULL FROM e0 CROSS JOIN e0 e)
        , e2(n) AS (SELECT TOP(POWER(2,POWER(2,2))) NULL FROM e1 CROSS JOIN e1 e)
        , e3(n) AS (SELECT TOP(POWER(2,POWER(2,3))) NULL FROM e2 CROSS JOIN e2 e)
        , e4(n) AS (SELECT TOP(POWER(2,POWER(2,4))) NULL FROM e3 CROSS JOIN e3 e)
        , e5(n) AS (SELECT TOP(POWER(2.,POWER(2,5)-1)-1) NULL FROM e4 CROSS JOIN e4 e)
    , numbers(position) AS
    (
        SELECT TOP(DATALENGTH(@value)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
        FROM e5
    )
    -- UTF-8 Algorithm (http://en.wikipedia.org/wiki/UTF-8)
    -- For each octet, count the high-order one bits, and extract the data bits.
    , octets AS
    (
        SELECT position, highorderones, partialcodepoint
        FROM numbers a
        -- Split UTF8 string into rows of one octet each.
        CROSS APPLY (SELECT octet = ASCII(SUBSTRING(@value, position, 1))) b
        -- Count the number of leading one bits
        CROSS APPLY (SELECT highorderones = 8 - FLOOR(LOG( ~CONVERT(tinyint, octet) * 2 + 1)/LOG(2))) c
        CROSS APPLY (SELECT databits = 7 - highorderones) d
        CROSS APPLY (SELECT partialcodepoint = octet % POWER(2, databits)) e
    )
    -- Compute the Unicode codepoint for each sequence of 1 to 4 bytes
    , codepoints AS
    (
        SELECT position, codepoint
        FROM
        (
            -- Get the starting octect for each sequence (i.e. exclude the continuation bytes)
            SELECT position, highorderones, partialcodepoint
            FROM octets
            WHERE highorderones <> 1
        ) lead
        CROSS APPLY (SELECT sequencelength = CASE WHEN highorderones in (1,2,3,4) THEN highorderones ELSE 1 END) b
        CROSS APPLY (SELECT endposition = position + sequencelength - 1) c
        CROSS APPLY
        (
            -- Compute the codepoint of a single UTF-8 sequence
            SELECT codepoint = SUM(POWER(2, shiftleft) * partialcodepoint)
            FROM octets
            CROSS APPLY (SELECT shiftleft = 6 * (endposition - position)) b
            WHERE position BETWEEN lead.position AND endposition
        ) d
    )
    -- Concatenate the codepoints into a Unicode string
    SELECT @result = CONVERT(xml,
        (
            SELECT NCHAR(codepoint)
            FROM codepoints
            ORDER BY position
            FOR XML PATH('')
        )).value('.', 'nvarchar(max)');
    RETURN @result;
END
GO

Jason Penny还编写了一个SQL函数来将UTF-8转换为Unicode(MIT许可证),它为我提供了一个简单的示例:

CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX))
   RETURNS NVarChar(MAX)
AS
BEGIN
   DECLARE @out NVarChar(MAX), @i int, @c int, @c2 int, @c3 int, @nc int
   SELECT @i = 1, @out = ''
   WHILE (@i <= Len(@in))
   BEGIN
      SET @c = Ascii(SubString(@in, @i, 1))
      IF (@c < 128)
      BEGIN
         SET @nc = @c
         SET @i = @i + 1
      END
      ELSE IF (@c > 191 AND @c < 224)
      BEGIN
         SET @c2 = Ascii(SubString(@in, @i + 1, 1))
         SET @nc = (((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
         SET @i = @i + 2
      END
      ELSE
      BEGIN
         SET @c2 = Ascii(SubString(@in, @i + 1, 1))
         SET @c3 = Ascii(SubString(@in, @i + 2, 1))
         SET @nc = (((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
         SET @i = @i + 3
      END
      SET @out = @out + NChar(@nc)
   END
   RETURN @out
END
GO

安东尼的打勾答案对我来说"看起来"更好,但如果进行转换并调查任何差异,也许可以同时运行?!

此外,我们还使用了下面非常丑陋的代码来检测 BMP 页面 unicode 字符,这些字符被编码为 UTF-8,然后从 varchar 转换为 nvarchar 字段,可以转换为 UCS-16。

LIKE (N'%[' + CONVERT(NVARCHAR,(CHAR(192))) + CONVERT(NVARCHAR,(CHAR(193))) + CONVERT(NVARCHAR,(CHAR(194))) + CONVERT(NVARCHAR,(CHAR(195))) + CONVERT(NVARCHAR,(CHAR(196))) + CONVERT(NVARCHAR,(CHAR(197))) + CONVERT(NVARCHAR,(CHAR(198))) + CONVERT(NVARCHAR,(CHAR(199))) + CONVERT(NVARCHAR,(CHAR(200))) + CONVERT(NVARCHAR,(CHAR(201))) + CONVERT(NVARCHAR,(CHAR(202))) + CONVERT(NVARCHAR,(CHAR(203))) + CONVERT(NVARCHAR,(CHAR(204))) + CONVERT(NVARCHAR,(CHAR(205))) + CONVERT(NVARCHAR,(CHAR(206))) + CONVERT(NVARCHAR,(CHAR(207))) + CONVERT(NVARCHAR,(CHAR(208))) + CONVERT(NVARCHAR,(CHAR(209))) + CONVERT(NVARCHAR,(CHAR(210))) + CONVERT(NVARCHAR,(CHAR(211))) + CONVERT(NVARCHAR,(CHAR(212))) + CONVERT(NVARCHAR,(CHAR(213))) + CONVERT(NVARCHAR,(CHAR(214))) + CONVERT(NVARCHAR,(CHAR(215))) + CONVERT(NVARCHAR,(CHAR(216))) + CONVERT(NVARCHAR,(CHAR(217))) + CONVERT(NVARCHAR,(CHAR(218))) + CONVERT(NVARCHAR,(CHAR(219))) + CONVERT(NVARCHAR,(CHAR(220))) + CONVERT(NVARCHAR,(CHAR(221))) + CONVERT(NVARCHAR,(CHAR(222))) + CONVERT(NVARCHAR,(CHAR(223))) + CONVERT(NVARCHAR,(CHAR(224))) + CONVERT(NVARCHAR,(CHAR(225))) + CONVERT(NVARCHAR,(CHAR(226))) + CONVERT(NVARCHAR,(CHAR(227))) + CONVERT(NVARCHAR,(CHAR(228))) + CONVERT(NVARCHAR,(CHAR(229))) + CONVERT(NVARCHAR,(CHAR(230))) + CONVERT(NVARCHAR,(CHAR(231))) + CONVERT(NVARCHAR,(CHAR(232))) + CONVERT(NVARCHAR,(CHAR(233))) + CONVERT(NVARCHAR,(CHAR(234))) + CONVERT(NVARCHAR,(CHAR(235))) + CONVERT(NVARCHAR,(CHAR(236))) + CONVERT(NVARCHAR,(CHAR(237))) + CONVERT(NVARCHAR,(CHAR(238))) + CONVERT(NVARCHAR,(CHAR(239)))
    + N'][' + CONVERT(NVARCHAR,(CHAR(128))) + CONVERT(NVARCHAR,(CHAR(129))) + CONVERT(NVARCHAR,(CHAR(130))) + CONVERT(NVARCHAR,(CHAR(131))) + CONVERT(NVARCHAR,(CHAR(132))) + CONVERT(NVARCHAR,(CHAR(133))) + CONVERT(NVARCHAR,(CHAR(134))) + CONVERT(NVARCHAR,(CHAR(135))) + CONVERT(NVARCHAR,(CHAR(136))) + CONVERT(NVARCHAR,(CHAR(137))) + CONVERT(NVARCHAR,(CHAR(138))) + CONVERT(NVARCHAR,(CHAR(139))) + CONVERT(NVARCHAR,(CHAR(140))) + CONVERT(NVARCHAR,(CHAR(141))) + CONVERT(NVARCHAR,(CHAR(142))) + CONVERT(NVARCHAR,(CHAR(143))) + CONVERT(NVARCHAR,(CHAR(144))) + CONVERT(NVARCHAR,(CHAR(145))) + CONVERT(NVARCHAR,(CHAR(146))) + CONVERT(NVARCHAR,(CHAR(147))) + CONVERT(NVARCHAR,(CHAR(148))) + CONVERT(NVARCHAR,(CHAR(149))) + CONVERT(NVARCHAR,(CHAR(150))) + CONVERT(NVARCHAR,(CHAR(151))) + CONVERT(NVARCHAR,(CHAR(152))) + CONVERT(NVARCHAR,(CHAR(153))) + CONVERT(NVARCHAR,(CHAR(154))) + CONVERT(NVARCHAR,(CHAR(155))) + CONVERT(NVARCHAR,(CHAR(156))) + CONVERT(NVARCHAR,(CHAR(157))) + CONVERT(NVARCHAR,(CHAR(158))) + CONVERT(NVARCHAR,(CHAR(159))) + CONVERT(NVARCHAR,(CHAR(160))) + CONVERT(NVARCHAR,(CHAR(161))) + CONVERT(NVARCHAR,(CHAR(162))) + CONVERT(NVARCHAR,(CHAR(163))) + CONVERT(NVARCHAR,(CHAR(164))) + CONVERT(NVARCHAR,(CHAR(165))) + CONVERT(NVARCHAR,(CHAR(166))) + CONVERT(NVARCHAR,(CHAR(167))) + CONVERT(NVARCHAR,(CHAR(168))) + CONVERT(NVARCHAR,(CHAR(169))) + CONVERT(NVARCHAR,(CHAR(170))) + CONVERT(NVARCHAR,(CHAR(171))) + CONVERT(NVARCHAR,(CHAR(172))) + CONVERT(NVARCHAR,(CHAR(173))) + CONVERT(NVARCHAR,(CHAR(174))) + CONVERT(NVARCHAR,(CHAR(175))) + CONVERT(NVARCHAR,(CHAR(176))) + CONVERT(NVARCHAR,(CHAR(177))) + CONVERT(NVARCHAR,(CHAR(178))) + CONVERT(NVARCHAR,(CHAR(179))) + CONVERT(NVARCHAR,(CHAR(180))) + CONVERT(NVARCHAR,(CHAR(181))) + CONVERT(NVARCHAR,(CHAR(182))) + CONVERT(NVARCHAR,(CHAR(183))) + CONVERT(NVARCHAR,(CHAR(184))) + CONVERT(NVARCHAR,(CHAR(185))) + CONVERT(NVARCHAR,(CHAR(186))) + CONVERT(NVARCHAR,(CHAR(187))) + CONVERT(NVARCHAR,(CHAR(188))) + CONVERT(NVARCHAR,(CHAR(189))) + CONVERT(NVARCHAR,(CHAR(190))) + CONVERT(NVARCHAR,(CHAR(191)))
    + N']%') COLLATE Latin1_General_BIN

以上:

    检测编码 U+0080 至 U+FFFF 的多字节序列(U+0080 至 U+07FF 编码为 110xxxxx 10xxxxx,U+
  • 0800 至 U+FFFF 编码为 1110xxxx 10x 10xxxxxxx 10xxxxx
  • 即,它检测十六进制字节0xC0到0xEF后跟十六进制字节0x80到0xBF
  • 忽略 ASCII 控制字符 U+0000 到 U+001F
  • 忽略已正确编码为 unicode 的字符>= U+0100(即不是 UTF-8)
  • 忽略 unicode 字符 U+0080 到 U+00FF,如果它们看起来不是 UTF-8 序列的一部分,例如"coöperatief"。
  • 由于潜在的排序规则问题,不使用 LIKE "%[X-Y]" 表示 X=0x80 到 Y=0xBF
  • 使用 CONVERT(VARCHAR,CHAR(X)) 而不是 NCHAR,因为我们在 NCHAR 转换为错误值(对于某些值)时遇到了问题。
  • 忽略大于 U+FFFF 的 UTF 字符(4 到 6 个字节序列,其第一个字节为十六进制0xF0到 0xFD)

我做了一个解决方案,通过结合 @robocat 的答案、更多案例和从 https://github.com/benkasminbullock/unicode-c/blob/master/unicode.c 中获取的逻辑,以及从 https://dba.stackexchange.com/questions/139551/how-do-i-set-a-sql-server-unicode-nvarchar-string-to-an-emoji-or-supplementary 编码扩展 unicode 字符的问题的解决方案,也处理 4 字节序列(如表情符号)。 它既不快也不漂亮,但无论如何它都对我有用。 此特定解决方案包括 Unicode 替换字符,只要它发现未知字节。 在这些情况下,最好抛出异常,或者保持字节不变,因为将来的编码可能会关闭,但我更喜欢在我的用例中这样做。

-- Started with https://stackoverflow.com/questions/28168055/convert-text-value-in-sql-server-from-utf8-to-iso-8859-1
-- Modified following source in https://github.com/benkasminbullock/unicode-c/blob/master/unicode.c
-- Made characters > 65535 work using https://dba.stackexchange.com/questions/139551/how-do-i-set-a-sql-server-unicode-nvarchar-string-to-an-emoji-or-supplementary
CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX)) RETURNS NVarChar(MAX) AS
BEGIN
    DECLARE @out NVarChar(MAX), @thisOut NVARCHAR(MAX), @i int, @c int, @c2 int, @c3 int, @c4 int
    SELECT @i = 1, @out = ''
    WHILE (@i <= Len(@in)) BEGIN
        SET @c = Ascii(SubString(@in, @i, 1))
        IF @c <= 0x7F BEGIN
            SET @thisOut = NCHAR(@c)
            SET @i = @i + 1
        END
        ELSE IF @c BETWEEN 0xC2 AND 0xDF BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            IF @c2 < 0x80 OR @c2 > 0xBF BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                SET @thisOut = NCHAR(((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
                SET @i = @i + 2
            END
        END
        ELSE IF @c BETWEEN 0xE0 AND 0xEF BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            SET @c3 = Ascii(SubString(@in, @i + 2, 1))
            IF @c2 < 0x80 OR @c2 > 0xBF OR @c3 < 0x80 OR (@c = 0xE0 AND @c2 < 0xA0) BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                SET @thisOut = NCHAR(((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
                SET @i = @i + 3
            END
        END
        ELSE IF @c BETWEEN 0xF0 AND 0xF4 BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            SET @c3 = Ascii(SubString(@in, @i + 2, 1))
            SET @c4 = Ascii(SubString(@in, @i + 3, 1))
            IF @c2 < 0x80 OR @c2 >= 0xC0 OR @c3 < 0x80 OR @c3 >= 0xC0 OR @c4 < 0x80 OR @c4 >= 0xC0 OR (@c = 0xF0 AND @c2 < 0x90) BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                DECLARE @nc INT = (((@c & 0x07) * 262144 /* << 18 */) | ((@c2 & 0x3F) * 4096 /* << 12 */) | ((@c3 & 0x3F) * 64) | (@c4 & 0x3F))
                DECLARE @HighSurrogateInt INT = 55232 + (@nc / 1024), @LowSurrogateInt INT = 56320 + (@nc % 1024)
                SET @thisOut = NCHAR(@HighSurrogateInt) + NCHAR(@LowSurrogateInt)
                SET @i = @i + 4
            END
        END
        ELSE BEGIN
            SET @thisOut = NCHAR(0xFFFD)
            SET @i = @i + 1
        END
        SET @out = @out + @thisOut
    END
    RETURN @out
END
GO

我添加了一些修改以使用来自SQL Server 2017和2019的新字符串聚合函数string_agg

SELECT @result=STRING_AGG(NCHAR([codepoint]),'') WITHIN GROUP (ORDER BY position ASC) 
FROM codepoints

将 de @result 部分更改为此部分。XML 仍然以旧的方式工作。在 2019 年,string_agg 的工作速度比 XML 版本快得多(明显......string_agg现在是本地的,比较不公平)

这是我为SQL Server 2017编写的内联表值函数(TVF)的版本。 它仅限于 4000 字节的输入字符串,因为这足以满足我的需求。 限制输入大小并写入为 TVF 使此版本比到目前为止发布的缩放器值函数快得多。 它还处理四字节 UTF-8 序列(例如由表情符号创建的序列),这些序列无法用 UCS-2 字符串表示,方法是在其位置输出替换字符。

CREATE OR ALTER FUNCTION [dbo].[fnUTF8Decode](@UTF8 VARCHAR(4001)) RETURNS TABLE AS RETURN
/*  Converts a UTF-8 encoded VARCHAR to NVARCHAR (UCS-2).  Based on UTF-8 documentation on Wikipedia and the 
    code/discussion at https://stackoverflow.com/a/31064459/1979220.
    
    One can quickly detect strings that need conversion using the following expression:
        <FIELD> LIKE CONCAT('%[', CHAR(192), '-', CHAR(255), ']%') COLLATE Latin1_General_BIN. 
    Be aware, however, that this may return true for strings that this function has already converted to UCS-2.
    See robocat's answer on the above referenced Stack Overflow thread for a slower but more robust expression.
    Notes/Limitations
    1)  Written as a inline table-valued function for optimized performance.  
    
    2)  Only tested on a database with SQL_Latin1_General_CP1_CI_AS collation.  More specifically, this was
        not designed to output Supplementary Characters and converts all such UTF-8 sequences to �.
    3)  Empty input strings, '', and strings with nothing but invalid UTF-8 chars are returned as NULL.
    4)  Assumes input is UTF-8 compliant.  For example, extended ASCII characters such as en dash CHAR(150)
        are not allowed unless part of a multi-byte sequence and will be skipped otherwise.  In other words:
            
            SELECT * FROM dbo.fnUTF8Decode(CHAR(150)) -> NULL
    
    5)  Input should be limited to 4000 characters to ensure that output will fit in NVARCHAR(4000), which is
        what STRING_AGG outputs when fed a sequence of NVARCHAR(1) characters generated by NCHAR.  However,
        T-SQL silently truncates overlong parameters so we've declared our input as VARCHAR(4001) to allow 
        STRING_AGG to generate an error on overlong input.  If we didn't do this, callers would never be
        notified about truncation.
    6)  If we need to process more than 4000 chars in the future, we'll need to change input to VARCHAR(MAX) and 
        CAST the CASE WHEN expression to NVARCHAR(MAX) to force STRING_AGG to output NVARCHAR(MAX).  Note that
        this change will significantly degrade performance, which is why we didn't do it in the first place.
    7)  Due to use of STRING_AGG, this is only compatible with SQL 2017.  It will probably work fine on 2019
        but that version has native UTF-8 support so you're probably better off using that.  For earlier versions,
        replace STRING_AGG with a CLR equivalent (ms-sql-server-group-concat-sqlclr) or FOR XML PATH(''), TYPE...
*/
SELECT      STRING_AGG (
                CASE 
                    WHEN A1 & 0xF0 = 0xF0 THEN  --Four byte sequences (like emoji) can't be represented in UCS-2
                        NCHAR(0xFFFD)           --Output U+FFFD (Replacement Character) instead
                    WHEN A1 & 0xE0 = 0xE0 THEN  --Three byte sequence; get/combine relevant bits from A1-A3
                        NCHAR((A1 & 0x0F) * 4096 | (A2 & 0x3F) * 64 | (A3 & 0x3F))
                    WHEN A1 & 0xC0 = 0xC0 THEN  --Two byte sequence; get/combine relevant bits from A1-A2
                        NCHAR((A1 & 0x3F) * 64 | (A2 & 0x3F))
                    ELSE NCHAR(A1)              --Regular ASCII character; output as is
                END
            , '') UCS2
FROM        dbo.fnNumbers(ISNULL(DATALENGTH(@UTF8), 0))
CROSS APPLY (SELECT ASCII(SUBSTRING(@UTF8, I, 1)) A1, ASCII(SUBSTRING(@UTF8, I + 1, 1)) A2, ASCII(SUBSTRING(@UTF8, I + 2, 1)) A3) A
WHERE       A1 <= 127 OR A1 >= 192              --Output only ASCII chars and one char for each multi-byte sequence
GO

请注意,上述内容需要"数字"表或生成器函数。 这是我使用的功能:

CREATE OR ALTER FUNCTION [dbo].[fnNumbers](@MaxNumber BIGINT) RETURNS TABLE AS RETURN
/*  Generates a table of numbers up to the specified @MaxNumber, limited to 4,294,967,296.  Useful for special case
    situations and algorithms.  Copied from https://www.itprotoday.com/sql-server/virtual-auxiliary-table-numbers
    with minor formatting and name changes.
*/
WITH L0 AS (
    SELECT 1 I UNION ALL SELECT 1           --Generates 2 rows
), L1 AS (
    SELECT 1 I FROM L0 CROSS JOIN L0 L      --          4 rows
), L2 AS (
    SELECT 1 I FROM L1 CROSS JOIN L1 L      --          16 rows
), L3 AS (
    SELECT 1 I FROM L2 CROSS JOIN L2 L      --          256 rows
), L4 AS (
    SELECT 1 I FROM L3 CROSS JOIN L3 L      --          65,536 rows
), L5 AS (
    SELECT 1 I FROM L4 CROSS JOIN L4 L      --          4,294,967,296 rows
), Numbers AS (
    SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) I FROM L5
)
SELECT TOP (@MaxNumber) I FROM Numbers ORDER BY I
GO

我刚刚成功地创建了一个新字段作为 varchar(255) 并将新字段设置为旧字段 nvarchar(255)。这产生了国际场所的"美国化"版本。

更新世界城市设置admin_correct = admin_name

 varchar(255)     nvarchar(255)

我找到了我需要做的查询,只是还没有编码。

ALTER TABLE dbo.MyTable ALTER COLUMN CharCol
        varchar(10)COLLATE Latin1_General_CI_AS NOT NULL;

最新更新