C中的连续字符串替换



请参阅以下代码,该代码通过循环遍历所有要替换的utf8字符来执行连续的字符/字符串替换;你会提出另一个更高效的结构吗?

/* UTF-8 encoded symbols to be replaced. The list is terminated by NULL and
   is paired by index with the `ebcdic` table: utf8[i] is replaced by
   ebcdic[i]. */
static char *utf8[66] =  { "◊",    "⎕",    "⍞",    "⌹",    "⊤",    "⊥",
"⌶",    "⌈",    "∪",    "⍕",    "⍎",    "│",
"⍟",    "∆",    "∇",    "→",    "←",    "⌊",
"┐",    "└",    "─",    "↑",    "↓",    "≡",
"⍸",    "⋸",    "∵",    "⌷",    "⍂",    "⌻",
"⊣",    "⊢",    "⋄",    "┘",    "┌",    "⍺",
"⊂",    "⊃",    "⍝",    "⍲",    "⍴",    "⍱",
"⌽",    "⊖",    "○",    "∨",    "⍳",    "⍬",
"∈",    "∩",    "⌿",    "⍀",    "≥",    "≤",
"≠",    "×",    "÷",    "⍙",    "∘",    "⍵",
"⍫",    "⍋",    "⍒",    "¯",    "¨",    NULL    };
/* Single-byte replacement codes, paired by index with the `utf8` table and
   terminated by NULL. Each entry is a one-character string written with a
   hexadecimal escape (e.g. "\xd8" is the single byte 0xD8), not the
   three-character text "xd8". */
static char *ebcdic[66] = { "\x8d", "\x90", "\x91", "\x92", "\x98", "\x9d",
                            "\x9f", "\xa9", "\xac", "\xae", "\xaf", "\xb3",
                            "\xb5", "\xb6", "\xb7", "\xb8", "\xbd", "\xbe",
                            "\xbf", "\xc0", "\xc4", "\xc6", "\xc7", "\xcf",
                            "\xd0", "\xd1", "\xd2", "\xd3", "\xd4", "\xd5",
                            "\xd6", "\xd7", "\xd8", "\xd9", "\xda", "\xe0",
                            "\xe2", "\xe3", "\xe4", "\xe5", "\xe6", "\xe7",
                            "\xe8", "\xe9", "\xea", "\xeb", "\xec", "\xed",
                            "\xee", "\xef", "\xf0", "\xf1", "\xf2", "\xf3",
                            "\xf4", "\xf5", "\xf6", "\xf7", "\xf8", "\xf9",
                            "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", NULL    };
/* replace() is defined further down in this file; declare it before use. */
char *replace(const char *s, const char *oldW, const char *newW);

/*
 * Replace every symbol listed in `utf8` by the matching entry of `ebcdic`
 * (the tables are walked in parallel until the NULL terminator) and append
 * a newline to the result.
 *
 * Returns a newly allocated string that the caller must free(), or NULL if
 * an allocation fails. `line` itself is not modified.
 */
char *convert(char *line) {
    /* Work on a private copy; +1 leaves room for the terminating '\0'. */
    char *current = malloc(strlen(line) + 1);
    if (!current) {
        return NULL;
    }
    strcpy(current, line);

    for (int i = 0; ebcdic[i]; i++) {
        /* replace() hands back a fresh buffer, so just take ownership of it
           instead of copying it once more. */
        char *next = replace(current, utf8[i], ebcdic[i]);
        free(current);
        if (!next) {
            return NULL;
        }
        current = next;
    }

    /* Grow by one byte for the newline (plus the terminator). */
    size_t len = strlen(current);
    char *tmp = realloc(current, len + 2);
    if (!tmp) {
        free(current);
        return NULL;
    }
    tmp[len] = '\n';
    tmp[len + 1] = '\0';
    return tmp;
}
/*
 * Return a newly allocated copy of `s` in which every non-overlapping
 * occurrence of `oldW` (found scanning left to right) is replaced by `newW`.
 *
 * An empty `oldW` matches nothing, so the result is a plain copy of `s`.
 * The caller owns the returned buffer and must free() it. Returns NULL if
 * the allocation fails.
 */
char *replace(const char *s, const char *oldW, const char *newW) {
    size_t newWlen = strlen(newW);
    size_t oldWlen = strlen(oldW);
    size_t cnt = 0;

    /* Count the occurrences so the result can be allocated in one go. */
    if (oldWlen > 0) {
        for (const char *p = strstr(s, oldW); p; p = strstr(p + oldWlen, oldW)) {
            cnt++;
        }
    }

    /* Subtract before adding so the size never wraps when newW is shorter. */
    char *result = malloc(strlen(s) - cnt * oldWlen + cnt * newWlen + 1);
    if (!result) {
        return NULL;
    }

    char *out = result;
    if (oldWlen > 0) {
        const char *hit;
        while ((hit = strstr(s, oldW)) != NULL) {
            /* Copy the untouched text in front of the match, then newW. */
            size_t gap = (size_t)(hit - s);
            memcpy(out, s, gap);
            out += gap;
            memcpy(out, newW, newWlen);
            out += newWlen;
            s = hit + oldWlen;
        }
    }
    /* Copy the tail after the last match, including the terminating '\0'. */
    strcpy(out, s);
    return result;
}
  • 更新-001:添加了replace()的代码
  • update-002:已将for/loop更改为while

感谢您的关注,在这种特殊情况下,我更关心可读性和内存使用,而不是性能。

我假设您正试图将编写该代码作为一种学习体验,否则请使用现有的工具/库。

当你想转换字符/代码点时,基本算法如下:

从输入字符串中获取下一个代码点,转换该代码点(或保持原样),将转换后的代码点存储在输出字符串的末尾。重复以上步骤,直到输入字符串结束。

由于输入字符串的每个代码点只占一个char,因此获取下一个代码点就像逐个遍历输入字符串中的char一样简单。这也意味着代码点的转换可以通过一个大小为256的简单查找表来完成(假设char为8位)。而UTF-8代码点的长度不一定是1个字节,所以我们必须考虑到这一点。

/* This syntax (designated initializers) just means that the array is
initialized with
ebdic2utf8_lut[0x8d] = "◊", ebdic2utf8_lut[0x90] = "⎕", etc.
Array elements that are not explicitly assigned in the initialization
list will be initialized to `0` (or NULL).
We may treat array elements with value `0` as "keep as is". */
static const char *const ebdic2utf8_lut[256] = {
[0x8d] = "◊",
[0x90] = "⎕",
[0x91] = "⍞",
[0x92] = "⌹",
/* Rest of initializations left out for brevity */
};
/*
 * Convert `src`, in which every character is one byte, to UTF-8 using
 * `ebdic2utf8_lut`. Bytes whose table entry is NULL are copied unchanged.
 *
 * Returns a newly allocated, zero-terminated string that the caller must
 * free(). On allocation failure an error is printed and the program exits.
 */
char * convert(const char *src)
{
    /* Pass 1: measure the output so it can be allocated exactly once. */
    size_t dst_length = 0;
    for (const char *p = src; *p; p++)
    {
        /* Index with unsigned char: plain char may be signed. */
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];
        dst_length += utf8 ? strlen(utf8) : 1;
    }

    char *dst = malloc(dst_length + 1); /* +1 for zero termination */
    if (!dst)
    {
        perror("String allocation failed");
        exit(1);
    }

    /* Pass 2: append each converted character to the destination. */
    char *out = dst;
    for (const char *p = src; *p; p++)
    {
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];
        if (utf8)
        {
            size_t utf8_length = strlen(utf8);
            memcpy(out, utf8, utf8_length);
            out += utf8_length;
        }
        else
        {
            /* No conversion: keep the input character as is. */
            *out++ = *p;
        }
    }
    *out = '\0'; /* Zero termination */
    return dst;
}
/* Demo: convert a string containing three single-byte codes and print it. */
int main(void)
{
    /* "\x90" etc. are hexadecimal escapes, i.e. single bytes 0x90, 0x91, 0x92. */
    char *str = convert("Hello\x90\x91\x92World");
    /* This should print "Hello⎕⍞⌹World", unless you are under Windows.
       Windows and utf-8 doesn't mix very well */
    printf("%s\n", str);
    /* convert() returns an allocated string that the caller owns. */
    free(str);
    return 0;
}

此代码未经检查是否存在错误、使用风险自负等。

要从utf8转换为EBCDIC,仍然可以使用查找表。但是直接建立一个utf8->EBCDIC的查找表会因为体积太大而不切实际。不过我们可以使用EBCDIC->utf8的查找表(LUT),并遍历它来找到匹配项。

字符串转换的基本算法基本上还是一样的:获取输入字符串中的下一个utf8代码点/字符,将其转换为EBCDIC,将转换后的字符追加到输出字符串(如果没有找到对应的转换,则原样追加一个字符),从输入字符串中去掉已处理的前缀,然后重复。

在执行utf8->EBCDIC转换时,我们必须注意一个utf8代码点可能占多个字节。因此,我们必须比较输入字符串前缀中的多个字节,并且还必须让输入字符串的位置前进多个字节。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Lookup table indexed by the single-byte code; each assigned entry holds
   the UTF-8 string for that code. Entries that are not listed are
   zero-initialized (NULL); the lookup code below skips them. */
static const char *const ebdic2utf8_lut[256] = {
[0x8d] = "◊", [0x90] = "⎕", [0x91] = "⍞", [0x92] = "⌹",
[0x98] = "⊤", [0x9d] = "⊥", [0x9f] = "⌶", [0xa9] = "⌈",
[0xac] = "∪", [0xae] = "⍕", [0xaf] = "⍎", [0xb3] = "│",
[0xb5] = "⍟", [0xb6] = "∆", [0xb7] = "∇", [0xb8] = "→",
[0xbd] = "←", [0xbe] = "⌊", [0xbf] = "┐", [0xc0] = "└",
[0xc4] = "─", [0xc6] = "↑", [0xc7] = "↓", [0xcf] = "≡",
[0xd0] = "⍸", [0xd1] = "⋸", [0xd2] = "∵", [0xd3] = "⌷",
[0xd4] = "⍂", [0xd5] = "⌻", [0xd6] = "⊣", [0xd7] = "⊢",
[0xd8] = "⋄", [0xd9] = "┘", [0xda] = "┌", [0xe0] = "⍺",
[0xe2] = "⊂", [0xe3] = "⊃", [0xe4] = "⍝", [0xe5] = "⍲",
[0xe6] = "⍴", [0xe7] = "⍱", [0xe8] = "⌽", [0xe9] = "⊖",
[0xea] = "○", [0xeb] = "∨", [0xec] = "⍳", [0xed] = "⍬",
[0xee] = "∈", [0xef] = "∩", [0xf0] = "⌿", [0xf1] = "⍀",
[0xf2] = "≥", [0xf3] = "≤", [0xf4] = "≠", [0xf5] = "×",
[0xf6] = "÷", [0xf7] = "⍙", [0xf8] = "∘", [0xf9] = "⍵",
[0xfa] = "⍫", [0xfb] = "⍋", [0xfc] = "⍒", [0xfd] = "¯",
[0xfe] = "¨" };

/* Search `ebdic2utf8_lut` for the first entry whose UTF-8 string is a prefix
   of `str`. On a hit, store that string's byte length in `*increment` and
   return the table index as a char. On a miss, return 0 and leave
   `*increment` untouched. */
char utf8lookup(const char *str, size_t *increment)
{
    size_t code = 0;
    while (code < 256)
    {
        const char *candidate = ebdic2utf8_lut[code];
        if (candidate != NULL)
        {
            size_t candidate_len = strlen(candidate);
            if (!strncmp(str, candidate, candidate_len))
            {
                *increment = candidate_len;
                return (char)code;
            }
        }
        code++;
    }
    return 0;
}

/*
 * Convert the UTF-8 string `src` to single-byte codes: each prefix that
 * utf8lookup() recognises becomes one output byte, and any other byte is
 * copied unchanged.
 *
 * Returns a newly allocated, zero-terminated string that the caller must
 * free(). On allocation failure an error is printed and the program exits.
 */
char * convert_u2e(const char *src)
{
    /* Every step consumes at least one source byte and emits exactly one
       output byte, so the output never exceeds strlen(src). Allocate once
       instead of resizing for every character. */
    char *dst = malloc(strlen(src) + 1);
    if (!dst)
    {
        perror("String allocation failed");
        exit(1);
    }
    size_t dst_length = 0;
    while (*src)
    {
        /* Convert next character */
        size_t src_increment = 0;
        char ch = utf8lookup(src, &src_increment);
        /* If there is no conversion, keep the first character in `src` as
           is. The zero-increment check guarantees forward progress. */
        if (!ch || src_increment == 0)
        {
            ch = *src;
            src_increment = 1;
        }
        /* Append converted character to destination string */
        dst[dst_length++] = ch;
        src += src_increment;
    }
    /* Zero terminate */
    dst[dst_length] = '\0';

    return dst;
}
/* Demo: convert a UTF-8 string and print the resulting bytes in hex. */
int main(void)
{
    char *str = convert_u2e("Hello⎕⍞⌹World");
    for (const char *s = str; *s; s++)
    {
        /* Cast to unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%hhx ", (unsigned char)*s);
    }
    printf("\n");
    free(str);
    return 0;
}

最新更新