ACTCD19: 2-Gram Token Model of C/C++

115 views
Skip to first unread message

Andrew Tomazos

unread,
Mar 21, 2019, 8:43:13 PM3/21/19
to std-pr...@isocpp.org
Today I'm releasing an artifact from ACTCD19: a 2-gram token model of C/C++.

In short, I took all of the ~1 billion lines of C/C++ from a popular Linux distribution package archive (Debian Sid), tokenized them, found the 2^16 most common tokens, and then looked at consecutive pairs of those common tokens.

For each unique consecutive pair of tokens, I counted the number of occurrences of the pair, ranked the pairs by number of occurrences, and printed the list to a text file.

The full list is here: https://github.com/tomazos/actcd19  (23 MB)

As a sample, here are the top 100:

OCCURENCES TOKEN1 TOKEN2
164945669 ) ;
67929216 ; }
65467334 ) {
53726432 if (
51458279 ( )
51424109 ) )
43881403 , 0
41003732 0 ,
30636098 ; if
24879109 ) ,
22700384 , {
22126128 } ,
18481475 # define
18209227 0 ;
18084000 , -
16609718 ( (
16418272 } }
16280145 0x00 ,
16157455 , 0x00
15373250 = 0
15177226 # include
15131545 0 )
12608580 ] =
12443840 ; return
11826846 1 ,
11702558 , &
11003066 ( !
10700212 * )
10334144 char *
9977210 = (
9785911 - 1
9164656 { if
9155554 , 0x00000000
9107322 1 )
8712605 ( &
8712604 , 1
8551953 0x00000000 ,
8259921 break ;
8194044 60 ,
8145750 , 60
8091734 } else
7982929 ] ;
7851333 ; int
7782848 ( const
7715721 , const
7706712 ) (
7449936 # endif
7221090 for (
7113503 { return
7076402 1 ;
7060939 std ::
6988426 } if
6926199 ( *
6922312 ) return
6869464 , (
6808548 [ i
6741089 ; #
6707967 } ;
6480366 ; case
6409883 0 ]
6391962 } static
6359919 i ]
6309078 [ 0
6247162 , NULL
6226628 ] )
6127050 , int
6086446 ] .
5899953 } void
5893746 ) #
5885488 ; i
5827183 NULL )
5599382 ) .
5587185 ; break
5556216 ; static
5319908 1 ]
5186289 else {
4971658 ) ->
4959459 ; for
4938293 2 ,
4923439 ( int
4907318 ( void
4881240 NULL ;
4805595 const char
4773166 sizeof (
4757621 ) ==
4750206 ; void
4684770 ) const
4643800 , 2
4605787 void *
4576513 NULL ,
4559738 ( struct
4548680 , 0x0000
4509706 0x0000 ,
4494806 ] ,
4491637 > (
4369493 ++ )
4179616 ( 0
3917628 ( i
3914781 [ 1
3846912 i =


Reply all
Reply to author
Forward
0 new messages