我有一个文件,每行都包含一个字符串。
我需要找到最大的N行。
我能找到长度最大的行。例如,最长的一行(top-1)可以通过以下方式找到:
# Length of the longest line (top-1).
# awk reads the file directly; the original `cat file | awk` was a
# useless use of cat (UUOC) -- one extra process for no benefit.
awk '{ print length }' ./test.txt | sort -n | tail -1
或者可以通过以下方式找到最长的10行(top-10):
# Lengths of the 10 longest lines (top-10), ascending.
# Same fix as above: let awk open the file itself instead of piping
# it in with a useless `cat`.
awk '{ print length }' ./test.txt | sort -n | tail -10
但我还需要用shell脚本在输出长度的同时获得这些行的行号。
感谢您的帮助。
输入文件中的一些行:
testTweeVaderOverLedenInNLPeriodeGeboorte ( ) { java . util . List < nl . bzk . brp . model . objecttype . operationeel . BetrokkenheidModel > echtgenoten = java . util . Arrays . asList ( maakBetrokkenheden ( 20110701 , 20120101 , ( ( short ) ( 1 ) ) ) , maakBetrokkenheden ( 20120201 , 20120504 , ( ( short ) ( 1 ) ) ) ) ; org . mockito . Mockito . when ( relatieRepository . haalOpBetrokkenhedenVanPersoon ( org . mockito . Matchers . any ( nl . bzk . brp . model . objecttype . operationeel . PersoonModel . class ) , org . mockito . Matchers . any ( nl . bzk . brp . dataaccess . selectie . RelatieSelectieFilter . class ) ) ) . thenReturn ( echtgenoten ) ; org . mockito . Mockito . when ( persoonRepository . haalPersoonOpMetAdresViaBetrokkenheid ( echtgenoten . get ( 0 ) ) ) . thenReturn ( echtgenoten . get ( 0 ) . getBetrokkene ( ) ) ; org . mockito . Mockito . when ( persoonRepository . haalPersoonOpMetAdresViaBetrokkenheid ( echtgenoten . get ( 1 ) ) ) . thenReturn ( echtgenoten . get ( 1 ) . getBetrokkene ( ) ) ; java . util . List < nl . bzk . brp . model . objecttype . operationeel . PersoonModel > kandidaten = kandidaatVader . bepaalKandidatenVader ( new nl . bzk . brp . model . objecttype . operationeel . PersoonModel ( new nl . bzk . brp . model . objecttype . bericht . PersoonBericht ( ) ) , new nl . bzk . brp . model . attribuuttype . Datum ( 20120506 ) ) ; org . mockito . Mockito . verify ( persoonRepository , org . mockito . Mockito . times ( 2 ) ) . haalPersoonOpMetAdresViaBetrokkenheid ( ( ( nl . bzk . brp . model . objecttype . operationeel . BetrokkenheidModel ) ( org . mockito . Matchers . any ( ) ) ) ) ; "<AssertPlaceHolder>" ; } size ( ) { return elementen . size ( ) ; }
putListeners ( ) { final java . util . concurrent . atomic . AtomicInteger counter = new java . util . concurrent . atomic . AtomicInteger ( ) ; map . addListener ( new LRUMap . ModificationListener < java . lang . String , java . lang . Integer > ( ) { @ java . lang . Override public void onPut ( java . lang . String key , java . lang . Integer value ) { counter . incrementAndGet ( ) ; } @ java . lang . Override public void onRemove ( java . lang . String key , java . lang . Integer value ) { } } ) ; map . put ( "hello" , 1 ) ; map . put ( "hello2" , 2 ) ; "<AssertPlaceHolder>" ; } put ( java . lang . String , org . codehaus . httpcache4j . List ) { return super . put ( new org . codehaus . httpcache4j . util . CaseInsensitiveKey ( key ) , value ) ; }
testStatelessKieSession ( ) { org . kie . api . runtime . StatelessKieSession ksession = ( ( org . kie . api . runtime . StatelessKieSession ) ( org . kie . spring . tests . KieSpringComponentScanTest . context . getBean ( "ksession1" ) ) ) ; "<AssertPlaceHolder>" ; }
shouldHashSha1 ( ) { java . lang . String [ ] correctHashes = new java . lang . String [ ] { "da39a3ee5e6b4b0d3255bfef95601890afd80709" , "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8" , "285d0c707f9644b75e1a87a62f25d0efb56800f0" , "a42ef8e61e890af80461ca5dcded25cbfcf407a4" } ; java . util . List < java . lang . String > result = new java . util . ArrayList ( ) ; for ( java . lang . String password : fr . xephi . authme . security . HashUtilsTest . GIVEN_PASSWORDS ) { result . add ( fr . xephi . authme . security . HashUtils . sha1 ( password ) ) ; } "<AssertPlaceHolder>" ; } contains ( java . lang . String ) { return ( getObject ( path ) ) != null ; }
equalsOtherNullReturnsFalse ( ) { com . rackspacecloud . blueflood . types . BluefloodCounterRollup rollup = new com . rackspacecloud . blueflood . types . BluefloodCounterRollup ( ) ; "<AssertPlaceHolder>" ; } equals ( java . lang . Object ) { if ( ! ( obj instanceof com . rackspacecloud . blueflood . rollup . Granularity ) ) return false ; else return obj == ( this ) ; }
testFlatten ( ) { org . teiid . translator . document . Document doc = new org . teiid . translator . document . Document ( ) ; doc . addProperty ( "B" 4 , "AA" ) ; doc . addProperty ( "B" , "B" 2 ) ; org . teiid . translator . document . Document c1 = new org . teiid . translator . document . Document ( "c1" , false , doc ) ; c1 . addProperty ( "B" 1 , "11" ) ; org . teiid . translator . document . Document c2 = new org . teiid . translator . document . Document ( "c1" , false , doc ) ; c2 . addProperty ( "B" 3 , "B" 7 ) ; doc . addChildDocuments ( "c1" , java . util . Arrays . asList ( c1 , c2 ) ) ; org . teiid . translator . document . Document c4 = new org . teiid . translator . document . Document ( "c2" , false , doc ) ; c4 . addProperty ( "4" , "B" 0 ) ; org . teiid . translator . document . Document c5 = new org . teiid . translator . document . Document ( "c2" , false , doc ) ; c5 . addProperty ( "5" , "B" 6 ) ; doc . addChildDocuments ( "c2" , java . util . Arrays . asList ( c4 , c5 ) ) ; java . util . List < java . util . Map < java . lang . String , java . lang . Object > > result = doc . flatten ( ) ; java . util . List < java . util . Map < java . lang . String , java . lang . Object > > expected = java . util . Arrays . asList ( map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/1" , "11" , "B" 5 , "B" 0 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/2" , "B" 7 , "B" 5 , "B" 0 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/1" , "11" , "c2/5" , "B" 6 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/2" , "B" 7 , "c2/5" , "B" 6 ) ) ; "<AssertPlaceHolder>" ; } toArray ( ) { return java . util . Arrays . copyOf ( elementData , size ) ; }
testSetUnread ( ) { contact . setUnread ( 1 ) ; "<AssertPlaceHolder>" ; } getUnread ( ) { return unread ; }
testOracleDatabase ( ) { try { java . lang . String expectedSQL = ( org . pentaho . di . core . database . SelectCountIT . NonHiveSelect ) + ( org . pentaho . di . core . database . SelectCountIT . TableName ) ; org . pentaho . di . core . database . DatabaseMeta databaseMeta = new org . pentaho . di . core . database . DatabaseMeta ( org . pentaho . di . core . database . SelectCountIT . OracleDatabaseXML ) ; java . lang . String sql = databaseMeta . getDatabaseInterface ( ) . getSelectCountStatement ( org . pentaho . di . core . database . SelectCountIT . TableName ) ; "<AssertPlaceHolder>" ; } catch ( java . lang . Exception e ) { e . printStackTrace ( ) ; } } getSelectCountStatement ( java . lang . String ) { if ( ( databaseDialect ) != null ) { return databaseDialect . getSelectCountStatement ( tableName ) ; } return super . getSelectCountStatement ( tableName ) ; }
预期输出:
linenumber,length
5,5000
10,3850
2,2000
...
使用每个Unix盒子上存在的这些工具的任何版本:
$ awk '{ print NR "," length($0) }' file | sort -t ',' -n -r -k 2 | head -n 5
1,1717
6,1649
8,883
2,762
4,656
如果你真的想要的话,只需在开头添加echo 'linenumber,length';
即可获得标题行。
以上输出5行而不是10行,是为了演示top-N中N值的选择——因为提问者只提供了8行采样输入。
使用GNU awk:
$ gawk '{
a[NR]=length   # remember each line's length, keyed by its line number
}
END {
PROCINFO["sorted_in"]="@val_num_desc" # this line is GNU awk only
for(i in a)    # with the setting above, "in" iterates by value, numeric, descending
print i,a[i]   # output: linenumber length
}' file
输出:
1 1717
6 1649
8 883
2 762
4 656
5 375
3 268
7 107
如果你没有GNU awk,但有其他一些awk和sort
管道输出到:
$ awk ... | sort -k2nr
您可以在下面尝试。
# Top-10 longest lines as "linenumber,length" with a header row.
#
# Two bugs fixed vs. the original one-liner:
#   1. The header was printed from awk's BEGIN block, so it flowed
#      through `sort` and got sorted INTO the data instead of staying
#      on top. Print it outside the pipeline instead.
#   2. `sort -k2 -rn` had no `-t,`: with the default blank separator,
#      a "n,len" line is one single field, field 2 is empty, and the
#      numeric key never takes effect. Add `-t,` (and bound the key
#      with -k2,2 so it is exactly the length column).
# (Also drops the useless `cat`.)
echo 'linenumber,length'
awk '{ print NR "," length }' input.txt | sort -t, -k2,2rn | head -10
更新2:基准
在上面的另一个解决方案中添加了仅保留Top-10的过滤器后,在相同输入集上的耗时为11.896秒:
fgc; ( time ( pvE0 < "${m3t}" |
gawk '{
a[NR]=length
}
END {
PROCINFO["sorted_in"]="@val_num_desc" # this line is GNU awk only
for(i in a) {
print i,a[i]; if (9 < ++_) { break } } }' ) |
pvE9 ) | gcat -b | lgp3 5
in0: 1.85GiB 0:00:08 [ 228MiB/s] [ 228MiB/s] [=>] 100%
out9: 0.00 B 0:00:11 [0.00 B/s] [0.00 B/s] [<=>]
1 6954837 18458
2 11417380 14247
3 6654331 11188
4 7576850 10352
5 12262953 10182
6 12279191 10156
7 12329231 9679
8 11479085 9568
9 12329230 9400
10 12418983 8666
out9: 143 B 0:00:11 [12.0 B/s] [12.0 B/s] [<=> ]
( pvE 0.1 in0 < "${m3t}" | gawk ; )
11.49s user 0.77s system 103% cpu 11.896 total
更新1:
如果您愿意大胆尝试,并假设输入从一开始就是完全有效的UTF-8,那么通过添加下面这个小函数,它就可以同时报告行号、字节计数以及UTF-8字符计数:
function _______(_) {    # only applicable for non-unicode aware awks
    _ = $(_<_)                              # i.e. $0
    gsub("[\200-\301\365-\377]+", "", _)    # strip UTF-8 continuation / invalid bytes
    return length(_)
}
这样在 awk 中完成这项工作要快得多(muuuuuuuuuch faster)——处理一个装满UTF-8文本的1.85 GB文件甚至用不了1.2秒:
它不是存储每一行,而是只在最短的现有条目被击倒到第11位时更新10项数组中的条目
由于出现平局时保留现有条目,因此整体上对数组的更新次数很少。
它还将最短的条目临时存储到变量中,这比测试文件中1200万行中的每一行读取和写入数组的开销要小得多
|
fgc; ( time ( pvE0 < "${m3t}" | mawk2 ' function ______(___,_,__,____,_____) { __=(_=3)^_^_ _____="" for(____ in ___) { _____=__==(__=+(_=___[____])<+__ ?_:__) ?_____:____ } return _____ } BEGIN { split(sprintf("%0*.f", (__=10)-!_,_),___,_) _____=___[+_] = _*= FS = "^$" } _____<(____=length($!__)) { ___[_]=____ "_|_LineNum_::_"NR _____=+___[_=______(___)] } END { for(____=__+_;_<____;_++) { print "index :: ",_%__,"_length :: ",___[_%__] } } ' )) sleep 1 ( time ( pvE0 < "${m3t}" | mawk2 '{ print length($0),NR }' OFS== ) | LC_ALL=C gsort -t= -k 1,1nr -k 2,2nr ) | gsed -n '1,10p;10q' | gsort -t= -k 1,1n | gcat -b | rs -t -c$'n' -C= 0 3 | column -s= -t in0: 1.85GiB 0:00:01 [1.64GiB/s] [1.64GiB/s] [============>] 100% 0.93s user 0.42s system 117% cpu 1.145 total 1 index :: 2 length :: 16024 | LineNum :: 12417761 2 index :: 3 length :: 16033 | LineNum :: 12418983 3 index :: 4 length :: 22261 | LineNum :: 11417380 4 index :: 5 length :: 20574 | LineNum :: 6654331 5 index :: 6 length :: 20714 | LineNum :: 12329231 6 index :: 7 length :: 20077 | LineNum :: 12329230 7 index :: 8 length :: 18870 | LineNum :: 3781376 8 index :: 9 length :: 16801 | LineNum :: 9000781 9 index :: 0 length :: 25891 | LineNum :: 6954837 10 index :: 1 length :: 16051 | LineNum :: 11479085 in0: 1.85GiB 0:00:07 [ 247MiB/s] [ 247MiB/s] [=========>] 100% 1 16024 12417761 5 18870 3781376 9 22261 11417380 2 16033 12418983 6 20077 12329230 10 25891 6954837 3 16051 11479085 7 20574 6654331 4 16801 9000781 8 20714 12329231 2.63s user 0.42s system 39% cpu 7.681 total 5.85s user 0.51s system 72% cpu 8.808 total