Below is some code ( Jeff-Relf.Me/X.CPP ) I wrote
to convert HTML to plain text. ( emails, websites, posts )
As simple as it is, slrn can't do it.
Forte Agent tries but fails; it's mostly about NZB files, videos.
Mozilla Thunderbird is flawless, Win Live Mail is OK⋅ish.
I use one of three browsers, depending on what I want to do.
1: X.EXE ( X.CPP, like Lynx but with images )
2: FireFox ( NoScript, Jeff-Relf.Me/userContent.CSS.TXT, etc. )
3: Internet Explorer ( for animations/crap )
// LnP points at wide characters ( one line of text ); LnA points at LnP's ( an array of lines ).
typedef wchar_t *LnP ; typedef LnP *LnA ;
Sh( LnP S, ... ); // Like "printf()", for my custom console.
// Three pointers into one array of lines; the LoopXx macro walks it from BB through PP, inclusive.
// NOTE(review): EE presumably marks the end of the allocation; that isn't shown here, confirm.
struct LnT { LnA BB, PP, EE ; };
// The console lines, a dynamic array of pointers.
LnT Ln ;
// Temporary⋅heap allocator: returns a block of Sz bytes taken from "Heap".
// Blocks handed out here are never passed to "free()" or "realloc()".
// A non⋅positive size, or a failed allocation, ends the program ( exit code 1 ).
LnP MallocTmp( int Sz ) {
  if ( Sz <= 0 )
    exit(1); // Panic ! ( bad size )
  LnP Block = (LnP)HeapAlloc( Heap, 0, Sz );
  if ( !Block )
    exit(1); // Panic ! ( never happens [ borked heap ] )
  return Block ; }
// Grow the console line ( *Ln.PP ) by Sz² characters taken from B².
// The grown text goes into a fresh MallocTmp() buffer and the line pointer
// is re⋅aimed at it; the old buffer is simply left behind.
// Does nothing when Sz² is not positive.
Cat( LnP B², int Sz² ) {
  if ( Sz² <= 0 ) return;
  LnP &Line = *Ln.PP ;
  // Old text + new text + one more ( presumably the terminator ).
  int Chars = szStr( Line ) + Sz² + 1 ;
  LnP Grown = MallocTmp( Chars * szChr );
  Str( Grown, L"%s%.*s", Line, Sz², B² );
  Line = Grown ; }
// Grow the console line by the single character Ch.
Cat( wchar_t Ch ) {
  wchar_t *One = &Ch ;
  Cat( One, 1 ); }
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
// Handy macros.
// All of the scanning macros below read and write names ( Ch, Ch2, Ch−1, P, B, E
// and their ² twins ) that must already exist where the macro is used.
// LOOP: loop forever; the body has to leave on its own.
#define LOOP while(1)
// Loop( N ): declares J and LLL, then runs the body with J = 0 ... N - 1.
#define Loop( N ) int J = - 1, LLL = N ; while( ++J < LLL )
// While( test ): forward scan starting at P. Every pass saves the previous
// character in Ch−1, loads Ch = *P and Ch2 = P[1] ( 0 when Ch is 0 ), then
// evaluates the test. The scan stops when the test fails ( P is left on the
// character that failed ) or once the previous character was the terminator.
#define While( Should_Loop ) Ch = 1, P-- ; \
while( Ch−1 = Ch, Ch && ( Ch = *++P, Ch2 = !Ch ? 0 : P[1], Should_Loop ) )
// While²: the same scan, driven by P² instead of P.
#define While²( Should_Loop ) Ch = 1, P²-- ; \
while( Ch−1 = Ch, Ch && ( Ch = *++P², Ch2 = !Ch ? 0 : P²[1], Should_Loop ) )
// WhileBac( test ): backward scan of B, starting at E ( E == 0 means the end
// of B ). Every pass sets Ch2 = *P, steps P back and loads Ch = *P ( 0 once P
// is before B ). The scan stops when Ch is 0 or the test fails, leaving P on
// that character ( or just before B ).
#define WhileBac( Should_Loop ) \
Ch = Ch2 = 0; if ( !E ) E = B + szStr( B ); P = E ; \
while ( P >= B && ( Ch2 = *P, ( Ch = --P < B ? 0 : *P ) && ( Should_Loop ) ) )
// WhileBac²: the same backward scan over B², E² and P².
#define WhileBac²( Should_Loop ) \
Ch = Ch2 = 0; if ( !E² ) E² = B² + szStr( B² ); P² = E² ; \
while ( P² >= B² && ( Ch2 = *P², ( Ch = --P² < B² ? 0 : *P² ) && ( Should_Loop ) ) )
// Zero( X ): fill the whole object X with zero bytes.
#define Zero( X ) memset( & X, 0, sizeof X )
// Loop through the console lines ( or any dynamic array ).
// Declares P, B, BB, EE and PP in the enclosing scope, then runs the
// statement that follows once per element, from (Xx).BB through (Xx).PP
// inclusive, with PP on the current element and B = P = *PP.
// Tt is the type prefix: Tt##P is the element type, Tt##A the array type.
// An empty array ( BB == 0 ) is never touched, the slot after (Xx).PP is
// never dereferenced, and no pointer before BB is formed.
// After the loop PP == EE, and B and P still hold the last element visited.
#define LoopXx( Tt, Xx ) \
Tt##P P = 0, B = 0 ; Tt##A BB = (Xx).BB, \
EE = BB ? (Xx).PP + 1 : BB, PP = BB ; \
if ( BB ) for ( ; PP < EE && ( B = P = *PP, 1 ) ; ++PP )
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
// State shared by the routines below, plus four line lists that Br()
// prints and then zeroes: tag IDs, image alt⋅text, image URIs and link URIs.
int PRE, inTag, Open⋅A, Indent ; LnT _id, _Alt, _Src, _hRef ;
// Remove trailing spaces and leading (blank) lines.
// PRE tags are honored.
//
// Works on the bottom console line ( *Ln.PP ): trailing '↡', characters <= 32
// and no⋅break spaces ( \xA0 ) are cut off. Outside of PRE, a line that ends up
// empty is dropped ( Ln.PP steps back ) and the line above is trimmed in turn.
// Uses B, E, P, Ch and Ch2, which are declared elsewhere.
// NOTE(review): nothing here keeps Ln.PP from stepping below Ln.BB when every
// line is blank; confirm that a non⋅blank line is always present.
rm⋅Trailing() {
LOOP { B = *Ln.PP, E = 0 ; WhileBac( Ch == L'↡' || Ch <= 32 || Ch == L'\xA0' );
// P stopped on the last character to keep ( or just before B ); end the line after it.
if ( ++P < E ) *P = 0 ; if ( !PRE && !*B ) Ln.PP-- ; else break ; } }
// Line break. Trims the current console line first, then:
//  - outside PRE, with no pending URIs / alt⋅text / IDs: appends '↡' and returns;
//  - inside an open link ( Open⋅A ): appends a space and returns;
//  - otherwise: prints and clears each pending list, then starts a new line.
Br() { int rv ; rm⋅Trailing();
if ( !PRE && !_Alt.BB && !_Src.BB && !_hRef.BB && !_id.BB ) {
// Print '↡' instead of a new line, conserve screen space.
Cat( L'↡' ); return ; }
if ( Open⋅A ) {
// Don't wrap link⋅labels.
Cat( 32 ); return ; }
if ( _hRef.BB ) {
LoopXx( Ln, _hRef )
// Print out URIs, one per line.
Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
Zero( _hRef ); }
if ( _Alt.BB ) {
LoopXx( Ln, _Alt )
// Print out "alternate (img) text", one per line.
Sh( L"\t\xA0%s", Human( B ) );
Zero( _Alt ); }
if ( _Src.BB ) {
LoopXx( Ln, _Src )
// Print out (img) URIs, one per line.
Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
Zero( _Src ); }
if ( _id.BB ) {
LoopXx( Ln, _id )
// Print the (tag) IDs; only the first may open a new line.
if ( Human( B ), PP == BB && **Ln.PP != L'⌗' )
// First ID ( and the current line isn't already an ID line ), give it its own line.
Sh( L"⌗%s", B );
else
// Concatenate IDs to the previous ones; rv is the length returned by Str().
rv = Str( _T, L", ⌗%s", B ), Cat( _T, rv );
Zero( _id ); }
// Print a blank line (Br), possibly indented.
// NOTE(review): the pad string holds a single space here, so at most one
// space can be printed whatever Indent is; confirm the literal wasn't collapsed.
Sh( L"%.*s", Indent*2, L" " ); }
// Up to 9,999 lines of HTML can be converted to plain⋅text.
// BB⋅Decode⋅ is the fixed array that receives the converted lines;
// BB⋅Decode and EE⋅Decode delimit the part of it that is in use.
const int mx⋅Lns⋅Decode = 9999 ; LnA BB⋅Decode, EE⋅Decode ;
LnP FileName, Wide, BB⋅Decode⋅[ mx⋅Lns⋅Decode ];
// Remember where the console's bottom line is ( as an index, iPP )
// and aim BB⋅Decode at the fixed array.
#define Init⋅Buff⋅Limited \
int iPP = Ln.PP - Ln.BB ; BB⋅Decode = BB⋅Decode⋅ ;
// Save off the console lines ( Ln ).
// Rewinds Ln.PP to the remembered index and copies the pointers of the lines
// printed after it into BB⋅Decode; the bottom line counts only if it isn't empty.
// NOTE(review): er() presumably clamps the count to mx⋅Lns⋅Decode; it is defined elsewhere.
#define Buff⋅Limited \
{ LnA EE = Ln.PP + !!**Ln.PP, BB = Ln.PP = Ln.BB + iPP ; \
int Lines = er( mx⋅Lns⋅Decode, EE - ++BB ); \
EE⋅Decode = BB⋅Decode + Lines, \
memmove( BB⋅Decode, BB, Lines * szPtr ) ; }
// Get the next line or cleanup/exit.
// The feedMe macro returns from the calling function when the input is used up.
#define feedMe { if ( !feedMe₍₎( B, P, PP, EE, iPP ) ) return ; }
// B, P : the current input line and the parse position in it ( both updated ).
// PP   : the slot of the current input line ( advanced when a line is fetched ).
// EE   : one past the last input line.
// iPP  : the console index saved by Init⋅Buff⋅Limited ( used by Buff⋅Limited ).
// Returns 1 with Ch / Ch2 set when there is something to parse; returns 0,
// after trimming and saving the output, when the input has run out or a
// MIME separator line was reached.
int feedMe₍₎( LnP &B, LnP &P, LnA &PP, LnA EE, int iPP ) {
// P is still null on the very first call ( the caller starts with P = 0 ),
// so it must be tested before it is dereferenced.
if ( P && ( Ch = *P ) )
// Return, no need for another (input, HTML) line.
return 1 ;
if ( PRE && !inTag )
Br();
else { LnP &T = *Ln.PP ;
// Print a space ( instead of a newline ).
if ( rv = szStr( T ), rv && T[ rv - 1 ] > 32 ) Cat( 32 ); }
if ( ++PP >= EE || ( P = B = *PP, B && MimeSep && EqN( B, *MimeSep, Sz_MimeSep ) ) ){
// Cleanup and exit.
rm⋅Trailing(); Buff⋅Limited return 0 ; }
// Set Ch and Ch2 to the first two characters.
// NOTE(review): a null line pointer ( *PP == 0 ) would still be dereferenced here; confirm lines are never null.
Ch = *B, Ch2 = !Ch ? 0 : B[1]; return 1 ; }
// Identify an HTML tag/entity.
// True when Ch and Ch2 ( plus P[2] for closing tags ) look like the start of
// an entity ( "&#", "&" + letter ) or of a tag ( "<!", "<?", "<" + letter, "</" + letter ).
#define Ch⋅BoTag& ( \
Ch == '&' && ( Ch2 == '#' || isAlpha( Ch2 ) ) \
|| Ch == '<' && ( Ch2 == '!' || Ch2 == '?' || isAlpha( Ch2 ) || Ch2 == '/' && isAlpha( P[2] ) ) )
// Convert (console) lines, BB to EE.
HTML⋅to⋅Text( LnA BB, LnA EE ) {
static rv ; Init⋅Buff⋅Limited LnA PP = BB - 1 ;
// Print a blank line.
Sh(0);
LnP euText, B = 0, P = 0, Alt, Src, id, hRef = 0 ;
int PlainText ;
Zero( _Alt ), Zero( _Src ), Zero( _hRef ), Zero( _id );
PlainText = PRE = Ch = Indent = Open⋅A = 0 ;
➤BoTag&: inTag = 0 ; feedMe
if ( PlainText ) {
// PlainText, no HTML.
euText = P, P += szStr( P ), Ch = 0 ; }
else {
// Parse out HTML tags/entities and text.
euText = 0 ; While( !Ch⋅BoTag& ) if ( !euText && Ch ) euText = P ; }
LnP BoTag& = P ; LnA BoTagPP = PP ; if ( !euText ) goto NonText ;
{ LnP B¹, B², P², E¹, E² ; int Sz¹, Sz², Sz¹⁺², Sz³, SP, SP², Ch, Ch2 ;
// Remove leading spaces.
if ( B¹ = P² = E¹ = *Ln.PP, Sz¹ = szStr( P² ), E¹ += Sz¹, *P² == L'↡' ) {
While²( Ch == L'↡' || Ch <= 32 );
Sz¹ -= P² - B¹, wmemmove( B¹, P², Sz¹ + 1 ); }
// Append text (euText, B²) to the current line (B¹).
if ( B² = euText, E² = BoTag&, !PRE ) {
// Consolidate spaces. PRE tags are honored.
Ch = !Sz¹ ? 0 : E¹[-1], SP = Ch <= 32, P² = B², D = B² - 1 ;
While²( P² < E² )
if ( SP² = Ch <= 32 || Ch == L'\xFeFF', !SP && SP² )
*++D = 32, SP = 1 ;
else if ( !SP² ) *++D = Ch, SP = 0 ;
// E² marks the end of the (now smaller) line, B².
E² = ++D ; }
if ( Sz² = E² - B², Sz² <= 0 ) goto NonText ;
if ( PRE ) { Cat( B², Sz² ); goto NonText ; }
Sz¹ = szStr( B¹ );
// Wrap the ( now larger ) line.
// ( ColsWin - 2, 126 columns, monospaced ).
LOOP {
if ( Sz³ = ColsWin - 2, Sz¹⁺² = Sz¹ + Sz², Sz¹⁺² <= Sz³ ) {
// Nothing to wrap, grow the line and/or break.
Cat( B², Sz² ); break ; }
E² = B² + Sz³ - Sz¹ ; WhileBac²( Iden( Ch ) );
if ( Sz³ = P² - B² + 1, Sz³ >= 22 )
// Add part of B² ( no less than 22 characters )
// to the current line.
Cat( B², Sz³ ), Sh(0), Sz¹ = 0 ;
else
// Give B² its own line.
Sh( L"%.*s", Sz¹ = Sz³, B² );
// Set B² to the not⋅yet⋅printed. ( or blank it )
Sz² -= Sz³, B² = ++P² ; } }
NonText: // Parse the HTML tag.
inTag = 1 ; if ( !Ch ) goto ➤BoTag& ;
The PRE tag ( HTML ) looks best on Google Groups because
it overrides the default: variable⋅width glyphs, auto⋅wrapped.
slrn should render HTML as plain text ( a simple task ) because
you ( Peter⋅J⋅Ross ) might want three HTML viewers:
slrn, Lynx and/or FireFox. Use the default, slrn, or an alternate.
⋅⋅⋅⋅⋅⋅⋅⋅
I'm fully aware that the code I post benefits no one but me,
no need to remind me; my "hideously obfuscated" code isn't for you.
To me, X.CPP is priceless, literally; "Virtue[work] is its own reward".
When I post code, I think about it −− both before and after −−
prompting serious changes −− rewrites −− that's why I do it.
From "Jeff-Relf.Me/X.CPP", the revised code:
// LnP points at wide characters ( one line of text ); LnA points at LnP's ( an array of lines ).
typedef wchar_t *LnP ; typedef LnP *LnA ;
Sh( LnP S, ... ); // Like "printf()", for my custom console.
// Three pointers into one array of lines; the LoopXx macro walks it from BB through PP, inclusive.
// NOTE(review): EE presumably marks the end of the allocation; that isn't shown here, confirm.
struct LnT { LnA BB, PP, EE ; };
// The console lines, a dynamic array of pointers.
// "*Ln.PP" is the bottom⋅most line.
LnT Ln ;
// Temporary⋅heap allocator: returns a block of Sz bytes taken from "Heap".
// Blocks handed out here are never passed to "free()" or "realloc()".
// A non⋅positive size, or a failed allocation, ends the program ( exit code 1 ).
LnP MallocTmp( int Sz ) {
  if ( Sz <= 0 )
    exit(1); // Panic ! ( bad size )
  LnP Block = (LnP)HeapAlloc( Heap, 0, Sz );
  if ( !Block )
    exit(1); // Panic ! ( never happens [ borked heap ] )
  return Block ; }
// Grow the bottom⋅most console line ( *Ln.PP ) by Sz² characters taken from B².
// The grown text goes into a fresh MallocTmp() buffer and the line pointer
// is re⋅aimed at it; the old buffer is simply left behind.
// Does nothing when Sz² is not positive.
Cat( LnP B², int Sz² ) {
  if ( Sz² <= 0 ) return;
  LnP &Line = *Ln.PP ;
  // Old text + new text + one more ( presumably the terminator ).
  int Chars = szStr( Line ) + Sz² + 1 ;
  LnP Grown = MallocTmp( Chars * szChr );
  Str( Grown, L"%s%.*s", Line, Sz², B² );
  Line = Grown ; }
// Grow the bottom⋅most console line by the single character Ch.
Cat( wchar_t Ch ) {
  wchar_t *One = &Ch ;
  Cat( One, 1 ); }
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
// Handy macros.
// All of the scanning macros below read and write names ( Ch, Ch2, Ch−1, P, B, E
// and their ² twins ) that must already exist where the macro is used.
// LOOP: loop forever; the body has to leave on its own.
#define LOOP while(1)
// Loop( N ): declares J and LLL, then runs the body with J = 0 ... N - 1.
#define Loop( N ) int J = - 1, LLL = N ; while( ++J < LLL )
// "While(0)" (for example) loops zero times and doesn't alter P;
// it's a good way to set Ch and Ch2 to P[0] and P[1].
//
// The macro is "while( A && B )", basically.
// A saves the previous character in Ch−1 and tests it.
// B sets Ch ( and Ch2 ), increments P and evaluates the caller's test ( ↻ ).
//
// B won't happen unless A is true. First time through the loop,
// A is true. Second pass, A is false if, initially, "*P == 0".
#define While( ↻ ) Ch = 1, P-- ; while( ( Ch−1 = Ch ) && ( Ch = *++P, Ch2 = !Ch ? 0 : P[1], ↻ ) )
// While²: the same scan, driven by P² instead of P.
#define While²( ↻ ) Ch = 1, P²-- ; while( ( Ch−1 = Ch ) && ( Ch = *++P², Ch2 = !Ch ? 0 : P²[1], ↻ ) )
// WhileBac( ↻ ): backward scan of B, starting at E ( E == 0 means the end
// of B ). Every pass sets Ch2 = *P, steps P back and loads Ch = *P ( 0 once P
// is before B ). The scan stops when Ch is 0 or the test fails, leaving P on
// that character ( or just before B ).
#define WhileBac( ↻ ) \
Ch = Ch2 = 0; if ( !E ) E = B + szStr( B ); P = E ; \
while ( P >= B && ( Ch2 = *P, ( Ch = --P < B ? 0 : *P ) && ( ↻ ) ) )
// WhileBac²: the same backward scan over B², E² and P².
#define WhileBac²( ↻ ) \
Ch = Ch2 = 0; if ( !E² ) E² = B² + szStr( B² ); P² = E² ; \
while ( P² >= B² && ( Ch2 = *P², ( Ch = --P² < B² ? 0 : *P² ) && ( ↻ ) ) )
// Zero( X ): fill the whole object X with zero bytes.
#define Zero( X ) memset( & X, 0, sizeof X )
// Loop through the console lines ( or any dynamic array ).
// Declares P, B, BB, EE and PP in the enclosing scope, then runs the
// statement that follows once per element, from (Xx).BB through (Xx).PP
// inclusive, with PP on the current element and B = P = *PP.
// Tt is the type prefix: Tt##P is the element type, Tt##A the array type.
// An empty array ( BB == 0 ) is never touched, the slot after (Xx).PP is
// never dereferenced, and no pointer before BB is formed.
// After the loop PP == EE, and B and P still hold the last element visited.
#define LoopXx( Tt, Xx ) \
Tt##P P = 0, B = 0 ; Tt##A BB = (Xx).BB, \
EE = BB ? (Xx).PP + 1 : BB, PP = BB ; \
if ( BB ) for ( ; PP < EE && ( B = P = *PP, 1 ) ; ++PP )
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
// State shared by the routines below ( First⋅Line makes Trim() skip its first call ),
// plus four line lists that Br() prints and then zeroes:
// tag IDs, image alt⋅text, image URIs and link URIs.
int First⋅Line, PRE, inTag, Open⋅A, Indent ; LnT _id, _Alt, _Src, _hRef ;
// Remove trailing spaces and leading (blank) lines.
// The PRE tag is honored.
//
// Works on the bottom⋅most console line ( *Ln.PP ) and leaves Ch / Ch2 set to
// its first two characters. Uses B, E, P, Ch and Ch2, which are declared elsewhere.
// NOTE(review): on the First⋅Line call it returns before Ch / Ch2 are refreshed,
// and nothing keeps Ln.PP from stepping below Ln.BB when every line is blank; confirm both are intended.
Trim() {
if ( First⋅Line ) {
// The first newline, don't trim anything.
First⋅Line = 0; return; }
// Cut trailing '↡', characters <= 32 and no⋅break spaces; outside of PRE,
// drop a line that ends up empty and trim the one above instead.
LOOP { B = *Ln.PP, E = 0 ; WhileBac( Ch == L'↡' || Ch <= 32 || Ch == L'\xA0' );
if ( ++P < E ) *P = 0 ; if ( !PRE && !*B ) Ln.PP-- ; else break ; }
if ( P = B, *B == L'↡' ) {
// '↡' replaced newlines; remove them ( and whitespace )
// from the start of the (console) line.
While( Ch == L'↡' || Ch <= 32 ); wmemmove( B, P, E - P + 1 ), P = B; }
// Set Ch and Ch2 to the first two characters.
While(0); }
Br() {
if ( Trim(), !PRE && Ch != L'⌗' && !( Ch == '\t' && Ch2 == L'\xA0' )
// URI lines start with "\t\A0" and
// '⌗'( U+2317 ) starts a line of tag IDs.
//
// If the line above isn't a URI or tag IDs, print '↡'
// instead of a new line ( to conserve screen space ).
Cat( L'↡' ); return ; }
if ( Open⋅A ) {
// Don't wrap text "buttons".
Cat( 32 ); return ; }
if ( _hRef.BB ) {
LoopXx( Ln, _hRef )
// Print out URIs, one per line.
Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
Zero( _hRef ); }
if ( _Alt.BB ) {
LoopXx( Ln, _Alt )
// Print out "alternate (img) text", one per line.
Sh( L"\t\xA0%s", Human( B ) );
Zero( _Alt ); }
if ( _Src.BB ) {
LoopXx( Ln, _Src )
// Print out (img) URIs, one per line.
Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
Zero( _Src ); }
if ( _id.BB ) {
LoopXx( Ln, _id )
if ( Human( B ), **Ln.PP != L'⌗' )
// If the current (console) line is tag IDs, grow it;
// otherwise, start a new line.
Sh( L"⌗%s", B );
else
// Concatenate IDs to the previous ones.
rv = Str( _T, L", ⌗%s", B ), Cat( _T, rv );
Zero( _id ); }
// Print a blank line (Br), possibly indented.
Sh( L"%.*s", Indent*2, L" " ); }
// Up to 9,999 lines of HTML can be converted to plain⋅text.
// BB⋅Decode⋅ is the fixed array that receives the converted lines;
// BB⋅Decode and EE⋅Decode delimit the part of it that is in use.
const int mx⋅Lns⋅Decode = 9999 ; LnA BB⋅Decode, EE⋅Decode ;
LnP FileName, Wide, BB⋅Decode⋅[ mx⋅Lns⋅Decode ];
// Remember where the console's bottom line is ( as an index, iPP )
// and aim BB⋅Decode at the fixed array.
#define Init⋅Buff⋅Limited \
int iPP = Ln.PP - Ln.BB ; BB⋅Decode = BB⋅Decode⋅ ;
// Save off the console lines ( Ln ).
// Rewinds Ln.PP to the remembered index and copies the pointers of the lines
// printed after it into BB⋅Decode; the bottom line counts only if it isn't empty.
// NOTE(review): er() presumably clamps the count to mx⋅Lns⋅Decode; it is defined elsewhere.
#define Buff⋅Limited \
{ LnA EE = Ln.PP + !!**Ln.PP, BB = Ln.PP = Ln.BB + iPP ; \
int Lines = er( mx⋅Lns⋅Decode, EE - ++BB ); \
EE⋅Decode = BB⋅Decode + Lines, \
memmove( BB⋅Decode, BB, Lines * szPtr ) ; }
// Get the next line or cleanup/exit.
// The feedMe macro returns from the calling function when the input is used up.
#define feedMe { if ( !feedMe₍₎( B, P, PP, EE, iPP ) ) return ; }
// B, P : the current input line and the parse position in it ( both updated ).
// PP   : the slot of the current input line ( advanced when a line is fetched ).
// EE   : one past the last input line.
// iPP  : the console index saved by Init⋅Buff⋅Limited ( used by Buff⋅Limited ).
// Returns 1 with Ch / Ch2 set when there is something to parse; returns 0,
// after trimming and saving the output, when the input has run out or a
// MIME separator line was reached.
int feedMe₍₎( LnP &B, LnP &P, LnA &PP, LnA EE, int iPP ) {
// Set Ch and Ch2 to the first two characters of the line (P).
While(0);
if ( Ch )
// Keep parsing the current line, don't fetch a new one.
return 1 ;
if ( PRE && !inTag )
Br();
else { LnP &T = *Ln.PP ;
// Print a space ( instead of a newline ).
if ( rv = szStr( T ), rv && T[ rv - 1 ] > 32 ) Cat( 32 ); }
if ( ++PP >= EE || ( P = B = *PP, B && MimeSep && EqN( B, *MimeSep, Sz_MimeSep ) ) ){
// Cleanup and exit.
Trim(); Buff⋅Limited return 0 ; }
// Load Ch / Ch2 from the start of the freshly fetched line.
// NOTE(review): a null line pointer ( *PP == 0 ) would be dereferenced here; confirm lines are never null.
While(0); return 1 ; }
// Identify an HTML tag/entity.
// True when Ch and Ch2 ( plus P[2] for closing tags ) look like the start of
// an entity ( "&#", "&" + letter ) or of a tag ( "<!", "<?", "<" + letter, "</" + letter ).
#define Ch⋅BoTag& ( \
Ch == '&' && ( Ch2 == '#' || isAlpha( Ch2 ) ) \
|| Ch == '<' && ( Ch2 == '!' || Ch2 == '?' \
|| isAlpha( Ch2 ) || Ch2 == '/' && isAlpha( P[2] ) ) )
// Convert (console) lines; from BB to EE.
HTML⋅to⋅Text( LnA BB, LnA EE ) {
static rv ; Init⋅Buff⋅Limited LnA PP = BB - 1 ;
// Print a blank line.
Sh(0);
LnP euText, B = L"", P = B, Alt, Src, id, hRef = 0 ; int PlainText ;
Zero( _Alt ), Zero( _Src ), Zero( _hRef ), Zero( _id );
First⋅Line = 1, PlainText = PRE = Ch = Indent = Open⋅A = 0 ;
➤BoTag&: inTag = 0 ; feedMe
if ( PlainText ) {
// Don't parse anything.
euText = P, P += szStr( P ), Ch = 0 ; }
else {
// Parse out HTML tags/entities and text.
euText = 0 ; While( !Ch⋅BoTag& ) if ( !euText && Ch ) euText = P ; }
LnP BoTag& = P ; LnA BoTagPP = PP ; if ( !euText ) goto Parse⋅Tag ;
{ LnP B¹, B², P², E¹, E² ; int Sz¹, Sz², Sz¹⁺², Sz³, SP, SP², Ch, Ch2 ;
B¹ = E¹ = *Ln.PP, Sz¹ = szStr( B¹ ), E¹ += Sz¹ ;
// Add parsed⋅out text ( euText, B² ) to the current line ( B¹ ).
if ( B² = euText, E² = BoTag&, !PRE ) {
// Consolidate spaces. PRE tags are honored.
Ch = !Sz¹ ? 0 : E¹[-1], SP = Ch <= 32, P² = B², D = B² - 1 ;
While²( P² < E² )
if ( SP² = Ch <= 32 || Ch == L'\xFeFF', !SP && SP² ) *++D = 32, SP = 1 ;
else if ( !SP² ) *++D = Ch, SP = 0 ;
// E² marks the end of the (now smaller) line, B².
E² = ++D ; }
if ( Sz² = E² - B², Sz² <= 0 || Sz² == 1 && *B² <= 32 )
// Skip the printing, the line is blank.
goto Parse⋅Tag ;
if ( PRE ) { Cat( B², Sz² ); goto Parse⋅Tag ; }
// Print the parsed⋅out text, wrapped to ColsWin - 2,
// 125 columns, monospaced.
LOOP { if ( Sz³ = ColsWin - 2, Sz¹⁺² = Sz¹ + Sz², Sz¹⁺² <= Sz³ ) { Cat( B², Sz² ); break ; }
E² = B² + Sz³ - Sz¹ ; WhileBac²( Iden( Ch ) );
if ( Sz³ = P² - B² + 1, Sz³ >= 22 ) Cat( B², Sz³ ), Sh(0), Sz¹ = 0 ;
else Sh( L"%.*s", Sz¹ = Sz³, B² ); Sz² -= Sz³, B² = ++P² ; } }
Parse⋅Tag: inTag = 1 ; if ( !Ch ) goto ➤BoTag& ;