使用 C++ 输入的 Unicode 字符的索引和直方图

计算每个符号的出现次数，以及它们在文本、单词或行中出现的位置。我有一个包含多种语言的单词列表。

我试图做的是计算每个字符的出现次数以及它们在文本中的位置或常见位置。此外，如果可以计算常见的音节数，那也会有所帮助。

sommige
disa
بَعْض - ba'th
mi qani - մի քանի
bəzi
batzuk
nyeykі/nyeykaya/nyeykaye/nyeykіya - нейкі/нейкая/нейкае/нейкія
kisu - কিসু
afouhe - بعض
neki
alguns
njakoj - някой
一些
algú/alguns/alguna/algunes
neki
někteří
nogle
berekhey āz - برخی از
een paar
kam - كام
some
iuj
mõned
berekhey āz - برخی از
ilan
joitakin
sommige
certains
algúns
ramdenime - რამდენიმე
einige
peripou - περίπου
keṭelāk - કેટલાક 
wasu
kèk
khemeh - כמה
kuch - कुछ
néhány
sumir
beberapa
roinnt
alcuni
ikutsu ka no - いくつかの
kelavu
មួយចំនួន
조금 - jo geum
هەندێک
aliquis
daži
keli
nekoi - некои
misy
beberapa
ഏതാനും
xi
yī xiē  - 一些
kaahi - कांही
neki
shwiya - بعض
kehi - केही
enkelte
gari
berekhey āz - برخی از
b'eda - بعضی
kilka
ਕਈ
alguns
câţiva/câteva
некоторые - nekotorыe
some
neki - неки
samahara - සමහර
niektorí
nekaj
algunos
baadhi
några
ilan
yakchand - якчанд
konjam - கொஞ்சம்
yan
konni - కొన్ని
บาง - baang
bazı
dejakі - деякі
chened - چند
ba'zi, qandaydir
một số
rhai
עטלעכע
die
okumbalwa

这是当前的代码，Sehe 对它做了修改，使其能够处理 Unicode：

//#define PREFER_BOOST
#include <clocale>
#include <cwctype>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <istream>
#include <iterator>
#include <map>
#include <string>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif
using namespace std;
// Global histogram: case-folded character -> number of occurrences.
std::map<wchar_t, int> letterCount;

// Function object for std::for_each: tallies every non-whitespace character
// into letterCount, folding upper case to lower case.
//
// The wide classification functions from <cwctype> are used on purpose: the
// narrow std::isspace/std::tolower are only defined for values representable
// as unsigned char (or EOF), so passing an arbitrary wchar_t to them is
// undefined behaviour. std::iswspace/std::towlower follow the C locale that
// is selected with std::setlocale.
struct Counter
{
    void operator()(wchar_t item)
    {
        const std::wint_t wide = static_cast<std::wint_t>(item);
        if (std::iswspace(wide))
            return;
        // Drop std::towlower here for a case-sensitive histogram.
        ++letterCount[static_cast<wchar_t>(std::towlower(wide))];
    }
};
// Reads input.txt as UTF-8, feeds every character to Counter and prints one
// "<char> : <count>" line per distinct character found in letterCount.
// Returns 1 when the input file cannot be opened.
int main()
{
    // The C locale drives the C wide-character classification functions.
    std::setlocale(LC_ALL, "en_US.UTF-8");
#ifdef PREFER_BOOST
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8");
#else
    std::locale loc("en_US.UTF-8");
#endif
    // Imbue before opening so the UTF-8 conversion facet is in place before
    // the first byte is read.
    std::wifstream input;
    input.imbue(loc);
    input.open("input.txt");
    if (!input.is_open())
    {
        std::wcerr << L"cannot open input.txt" << L'\n';
        return 1;
    }
    std::wcout.imbue(loc);

    // istreambuf_iterator does not skip whitespace; Counter decides what to ignore.
    std::istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        // '\n' instead of endl: no need to flush after every entry.
        std::wcout << it->first << " : " << it->second << L'\n';
    }
    return 0;
}

这是我的原始代码

 #include <iostream>
  #include <cctype>
 #include <fstream>
#include <string>
 #include <map>
  #include <istream>
   #include <vector>
 #include <list>
 #include <algorithm>
#include <iterator>

using namespace std;
 // ctype<char> facet whose classification table marks only the ASCII letters
 // A-Z and a-z as alphabetic and every other character as whitespace, so that
 // whitespace-skipping formatted extraction yields letters only.
 struct letter_only: std::ctype<char>
 {
    letter_only(): std::ctype<char>(get_table()) {}

    // Returns the classification table (table_size entries).
    static std::ctype_base::mask const* get_table()
    {
       static std::vector<std::ctype_base::mask>
             rc(std::ctype<char>::table_size, std::ctype_base::space);
       // Fill the two letter ranges separately: in ASCII the characters
       // [ \ ] ^ _ ` sit between 'Z' and 'a' and must stay non-letters.
       std::fill(rc.begin() + 'A', rc.begin() + 'Z' + 1, std::ctype_base::alpha);
       std::fill(rc.begin() + 'a', rc.begin() + 'z' + 1, std::ctype_base::alpha);
       return &rc[0];
    }
 };
// Function object for std::for_each: counts characters case-insensitively in
// its own letterCount map and converts to that map, so the functor returned
// by std::for_each can initialise a std::map<char, int> directly.
struct Counter
{
    std::map<char, int> letterCount;

    void operator()(char item)
    {
        // <cctype> functions need a value representable as unsigned char;
        // a plain char may be negative.
        const unsigned char uc = static_cast<unsigned char>(item);
        // Real whitespace test. Comparing the character with
        // std::ctype_base::space would compare it with a mask constant.
        if (std::isspace(uc))
            return;
        // Drop std::tolower here for a case-sensitive count.
        ++letterCount[static_cast<char>(std::tolower(uc))];
    }

    operator std::map<char, int>() const { return letterCount; }
};
// Counts the characters extracted from the file "T" and prints one
// "<char> : <count>" line per distinct character.
// Returns 1 when the file cannot be opened.
int main()
{
     std::ifstream input;
     // The letter_only facet decides which characters formatted extraction
     // treats as whitespace and therefore skips.
     input.imbue(std::locale(std::locale(), new letter_only()));
     input.open("T");
     if (!input.is_open())
     {
          std::cerr << "cannot open T" << '\n';
          return 1;
     }

     std::istream_iterator<char> start(input);
     std::istream_iterator<char> end;
     // std::for_each returns the functor; Counter converts to its map.
     std::map<char, int> letterCount = std::for_each(start, end, Counter());

     for (std::map<char, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
     {
          // '\n' instead of endl: no need to flush after every entry.
          std::cout << it->first << " : " << it->second << '\n';
     }
     return 0;
}

我试图得到什么的例子

к : 10 (2,5) (1,5,8) (2,7) (1,3,5)

先是找到的字母 к，然后是它的出现次数 10，最后是它在每个单词中出现的位置，如前所述。

这是我得到的，它似乎在我的机器¹ 上运行良好。

//#define PREFER_BOOST
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif
using namespace std;
// Global histogram: case-folded character -> number of occurrences.
std::map<wchar_t, int> letterCount;

// Function object for std::for_each: tallies every non-whitespace character
// into letterCount, folding upper case to lower case.
//
// Uses std::iswspace/std::towlower (declared in <cwctype>) because the narrow
// std::isspace/std::tolower are only defined for values representable as
// unsigned char (or EOF); feeding them an arbitrary wchar_t is undefined
// behaviour. The wide functions follow the C locale set with std::setlocale.
struct Counter
{
    void operator()(wchar_t item)
    {
        const std::wint_t wide = static_cast<std::wint_t>(item);
        if (std::iswspace(wide))
            return;
        // Drop std::towlower here for a case-sensitive histogram.
        ++letterCount[static_cast<wchar_t>(std::towlower(wide))];
    }
};
// Reads input.txt as UTF-8, feeds every character to Counter and prints one
// "<char> : <count>" line per distinct character found in letterCount.
// Returns 1 when the input file cannot be opened.
int main()
{
    // The C locale drives the C wide-character classification functions.
    std::setlocale(LC_ALL, "en_US.UTF-8");
#ifdef PREFER_BOOST
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8");
#else
    std::locale loc("en_US.UTF-8");
#endif
    // Imbue before opening so the UTF-8 conversion facet is in place before
    // the first byte is read.
    std::wifstream input;
    input.imbue(loc);
    input.open("input.txt");
    if (!input.is_open())
    {
        std::wcerr << L"cannot open input.txt" << L'\n';
        return 1;
    }
    std::wcout.imbue(loc);

    // istreambuf_iterator does not skip whitespace; Counter decides what to ignore.
    std::istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        // '\n' instead of endl: no need to flush after every entry.
        std::wcout << it->first << " : " << it->second << L'\n';
    }
    return 0;
}

（如果您更喜欢 Boost.Locale 库，则需要链接 boost_system、boost_locale 和 boost_thread；我没有观察到任何明显的行为差异。）

输出：

' : 3 , : 1 - : 32 / : 10 a : 67 b : 16 c : 7
d : 12 e : 61 f : 1 g : 16 h : 17 i : 46 j : 8
k : 41 l : 19 m : 19 n : 47 o : 20 p : 5 q : 3
r : 18 s : 21 t : 12 u : 21 v : 3 w : 3 x : 2
y : 21 z : 7 á : 1 â : 2 å : 1 è : 1 é : 1
í : 2 õ : 1 ú : 2 ā : 4 ē : 1 ě : 1 ī : 1
ı : 1 ř : 1 ţ : 1 ž : 1 ə : 1 ί : 1 ε : 1
ο : 1 π : 2 ρ : 1 υ : 1 а : 3 д : 2 е : 10
и : 2 й : 5 к : 10 н : 9 о : 4 р : 1 т : 1
ч : 1 ы : 2 я : 5 і : 6 ա : 1 ի : 2 մ : 1
ն : 1 ք : 1 ה : 1 ט : 1 כ : 2 ל : 1 מ : 1
ע : 3 ا : 4 ب : 7 خ : 3 د : 2 ر : 3 ز : 3
ض : 4 ع : 4 ك : 1 م : 1 ن : 2 ه : 1 َ : 1
 : 1 چ : 1 ک : 1 ی : 4 ێ : 1 ە : 1 ं : 1
ी : 2 ु : 1 े : 1 ক : 1 ি : 1 ু : 1 ਕ : 1
ક : 2 ટ : 1 ે : 1 க : 1 ் : 2 క : 1 ి : 1
 : 1 ഏ : 1 ു : 1 ර : 1 ස : 1 ง : 1 า : 1
ა : 1 დ : 1 ე : 2 ი : 1 მ : 2 ნ : 1 რ : 1
ច : 1 ន : 2 ម : 1 យ : 1 ួ : 2 ំ : 1 ố : 1
ộ : 1 い : 1 か : 1 く : 1 の : 1 一 : 2 些 : 2
금 : 1

¹ 有些字符可能无法正确显示，但这很可能是我的终端字体造成的。

这是我在所有其他人的帮助下让它做的事情，谢谢。

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#include <sstream>
using namespace std;
// Per-character statistics stored as the mapped value of letterCount.
struct op {
    int O;      // occurrence count, incremented by Counter()
    wstring P;  // position text built up in main(): "(" + comma-terminated indices + ")"
};
// Formats Number through wide stream insertion and appends a trailing comma,
// e.g. 7 becomes L"7,". The comma is the separator used in position lists.
template <typename T>
std::wstring NumberToString ( T Number )
{
    std::wostringstream formatted;
    formatted << Number << ",";
    return formatted.str();
}
std::map<wchar_t, struct op> letterCount;
void Counter(wchar_t  item) {
    if ( !std::isspace(item) ) {
        ++letterCount[std::tolower(item)].O;    //remove tolower if you want case-sensitive solution!
    }
}
// Reads the UTF-8 file "T" line by line and echoes each line. For every
// non-whitespace character (case folded with std::towlower from <cwctype>)
// it records how often the character occurs and at which zero-based indices
// of each line it was found. Finally prints "<char> : <count>@<groups>",
// where every group is "(i,j,...,)" and holds the positions from one line.
// Returns 1 when the file cannot be opened.
int main()
{
    wchar_t const* B = L"(";
    wchar_t const* E = L")";
    std::setlocale(LC_ALL, "en_US.UTF-8");
    std::locale loc("en_US.UTF-8");

    // Imbue before opening so the UTF-8 conversion facet is in place before
    // the first byte is read.
    std::wifstream input;
    input.imbue(loc);
    input.open("T");
    if (!input.is_open()) {
        std::wcerr << L"cannot open T" << L'\n';
        return 1;
    }
    std::wcout.imbue(loc);

    std::wstring line;
    // Testing the getline result (rather than eof()) avoids processing a
    // phantom empty line after the last real one.
    while (std::getline(input, line)) {
        std::wcout << line << L'\n';

        // Positions found on this line, keyed by the folded character. Working
        // on the wstring directly needs no heap copy and never visits the
        // terminating NUL, which must not be counted as a character.
        std::map<wchar_t, std::wstring> positions;
        for (std::wstring::size_type i = 0; i < line.size(); ++i) {
            const std::wint_t wide = static_cast<std::wint_t>(line[i]);
            if (std::iswspace(wide)) {
                continue;   // whitespace gets neither a count nor positions
            }
            // Count and position share one key, so upper and lower case
            // occurrences end up in the same entry and the same group.
            const wchar_t key = static_cast<wchar_t>(std::towlower(wide));
            ++letterCount[key].O;
            positions[key] += NumberToString(i);
        }

        // Append exactly one "(...)" group per character seen on this line.
        for (std::map<wchar_t, std::wstring>::iterator g = positions.begin(); g != positions.end(); ++g) {
            std::wstring& all = letterCount[g->first].P;
            all += B;
            all += g->second;
            all += E;
        }
    }

    for (std::map<wchar_t, struct op>::iterator it = letterCount.begin(); it != letterCount.end(); ++it) {
        // '\n' instead of endl: no need to flush after every entry.
        std::wcout << it->first <<" : "<< it->second.O << "@" << it->second.P << L'\n';
    }
    return 0;
}

输出：

н : 9@(36,42,49,56,)(9,)(8,)(0,)(7,)(15,)
о : 4@(12,)(11,)(3,5,)
р : 1@(6,)
т : 1@(4,)
ч : 1@(13,)
ы : 2@(7,19,)
я : 5@(47,61,)(10,)(11,)(11,)
і : 6@(5,30,40,60,)(5,13,)
ա : 1@(14,)
ի : 2@(11,16,)
մ : 1@(10,)
ն : 1@(15,)
ք : 1@(13,)

#include <iostream>
#include <stdio>
#include <conio>
// Reads a length n followed by up to n characters and prints, for every
// distinct character, the zero-based positions where it occurs and its total
// count, as one "(p1,p2,)<tab><tab>c:count" line per character.
// NOTE(review): clrscr() and getch() are non-standard console helpers; this
// block assumes they are provided by the conio header included above.
int main()
{
 char name[1000],temp[1000];
 int i,j,n,unique=0;
 clrscr();
 cout<<"Enter your char length:-";
 cin>>n;
 // Both buffers hold 1000 characters; reject anything that would overflow.
 if(!cin || n<1 || n>1000)
 {
  cout<<"\nInvalid length";
  getch();
  return 1;
 }
 // Drop the newline left behind by the length so it is not counted as text.
 if(cin.peek()=='\n')
  cin.ignore();
 cout<<"\nEnter your text below:-\n\n";
 //get the text
 for(i=0;i<n;i++)
 {
  int ch=getchar();
  if(ch==EOF)          // input ended early
   break;
  name[i]=static_cast<char>(ch);
 }
 n=i;                  // only the characters that were actually read
 //extracting unique characters to temp[]
 // The number of unique characters is tracked explicitly, so temp[] needs no
 // terminator and strlen() is never run on a buffer that may lack one.
 for(i=0;i<n;i++)
 {
    int present=0;
    for(j=0;j<unique;j++)
    {
      if(name[i]==temp[j])
      {
        present=1;
        break;
      }
    }
    if(present==0)
    {
     temp[unique]=name[i];
     unique++;
    }
 }
//counting char occurrence
for (i=0;i<unique; i++)
{
   int occurrences=0;
   cout<<"\n(";
   for (j=0;j<n; j++)
   {
      if(temp[i]==name[j])
      {
        occurrences++;
        cout<<j<<",";
      }
   }
   cout<<")\t\t"<<temp[i]<<":"<<occurrences;
}
getch();
return 0;
}

你应该研究霍夫曼编码：NIST on Huffman Coding

这样，您不仅可以得到所有字母的出现次数，还可以得到常见音节的数量（如果我对您的意思理解正确的话）。

霍夫曼算法通常用于压缩和构建搜索树，但它也能顺带解决你的问题（因为这正是霍夫曼算法所做的事情）。

CodeProject 上已经有一个 C++ 实现：“Huffman in C++”。您只需要其中的一部分，因为您可能对压缩不感兴趣。

相关内容

最新更新

热门标签：