Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

slrn can't convert HTML to plain text. ( emails, websites, posts ) 

1 view
Skip to first unread message

Jeff-Relf.Me

unread,
Mar 26, 2012, 1:21:37 AM
to
Below is some code ( Jeff-Relf.Me/X.CPP ) I wrote
to convert HTML to plain text. ( emails, websites, posts )

As simple as it is, slrn can't do it.
Forte Agent tries but fails; it's mostly about NZB files, videos. 
Mozilla Thunderbird is flawless, Win Live Mail is OK⋅ish.
 
I use one of three browsers, depending on what I want to do.
1: X.EXE ( X.CPP, like Lynx but with images )
2: FireFox ( NoScript, Jeff-Relf.Me/userContent.CSS.TXT, etc. )
3: Internet Explorer ( for animations/crap )

//  LnP is a pointer to wide characters ( one line of text );
//  LnA is a pointer to LnP ( an array of lines ).
typedef wchar_t  *LnP ; typedef  LnP  *LnA ;
Sh( LnP S, ... ); //  Like "printf()", for my custom console.

  //  Three cursors into an array of line pointers.  The code in view uses BB as
  //  the first element and PP as the last element in use ( loops stop at PP + 1 );
  //  EE is not used in this excerpt — presumably the end of the allocation.
  struct  LnT { LnA  BB, PP, EE ;  };
  //  The console lines, a dynamic array of pointers.
  LnT  Ln ;
  //  Allocate from a temporary heap; never use "free()" or "realloc()".
//  Hands back a block obtained from HeapAlloc() on "Heap"; Sz is passed
//  straight through as the requested size.  The result is never null:
//  a non-positive Sz or a failed allocation ends the program with exit(1).
LnP MallocTmp( int Sz ) {
  LnP  Mem = 0 ;

  //  Only ask the heap when the request makes sense.
  if ( Sz > 0 )
    Mem = (LnP)HeapAlloc( Heap, 0, Sz );

  //  Panic ! ( never happens [ borked heap ] )
  if ( !Mem )
    exit(1);

  return Mem ;
}

//  Grow the console line ( *Ln.PP ) by up to Sz² characters taken from B².
//  The old text and the new characters are formatted into a fresh MallocTmp()
//  buffer and *Ln.PP is re-pointed at it; the previous buffer is left alone.
//  A non-positive Sz² does nothing.
Cat( LnP B², int Sz² ) {
  if ( Sz² <= 0 )
    return;

  LnP &Line = *Ln.PP ;
  //  Current length, plus the appended characters, plus one more character.
  int Chars = szStr( Line ) + Sz² + 1 ;
  LnP Grown = MallocTmp( Chars * szChr );

  Str( Grown, L"%s%.*s", Line, Sz², B² );
  Line = Grown ;
}

//  Single-character overload: forwards the address of Ch with a count of 1.
Cat( wchar_t Ch ) { Cat( &Ch, 1 );  }
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
//  Handy macros.
//  Most of them expand to more than one statement and read/write names that
//  must already exist where they are used ( Ch, Ch2, Ch−1, P, B, E and the
//  "²" variants ), so they cannot serve as an unbraced if/else body.

//  Endless loop; leave it with break, return or goto.
#define  LOOP  while(1)
//  Runs the body with J = 0 .. N-1.  Declares J and LLL in the current scope.
#define  Loop( N )  int J = - 1, LLL = N ; while( ++J < LLL )
//  Forward scan over P.  P is stepped back and then pre-incremented, so the
//  first character examined is the one P points at on entry.  Each test copies
//  the old Ch into Ch−1, loads Ch = *P and Ch2 = P[1] ( 0 once Ch is 0 ), and
//  keeps looping while Should_Loop holds.  After a 0 has been loaded the next
//  test fails, so the scan never moves past the terminator.
#define  While( Should_Loop )  Ch = 1, P-- ;  \
  while( Ch−1 = Ch, Ch && ( Ch = *++P, Ch2 = !Ch ? 0 : P[1], Should_Loop ) )

//  Same scan, driven by P² instead of P.
#define  While²( Should_Loop )  Ch = 1, P²-- ;  \
  while( Ch−1 = Ch, Ch && ( Ch = *++P², Ch2 = !Ch ? 0 : P²[1], Should_Loop ) )

//  Backward scan over B.  Starts at E ( set to the end of B when E is null ).
//  Each test sets Ch2 = *P, steps P back one and sets Ch = *P ( 0 once P has
//  moved before B ); it keeps looping while Ch is non-zero and Should_Loop
//  holds.  When the scan stops, P is on the character that failed the test,
//  or one position before B.
#define  WhileBac( Should_Loop )  \
  Ch = Ch2 = 0;  if ( !E )  E = B + szStr( B );  P = E ;  \
  while ( P >= B && ( Ch2 = *P, ( Ch = --P < B ? 0 : *P ) && ( Should_Loop ) ) )

//  Same backward scan, using B², E² and P².
#define  WhileBac²( Should_Loop )  \
  Ch = Ch2 = 0;  if ( !E² )  E² = B² + szStr( B² );  P² = E² ;  \
  while ( P² >= B² && ( Ch2 = *P², ( Ch = --P² < B² ? 0 : *P² ) && ( Should_Loop ) ) )

//  Fill X with zero bytes.  X is pasted unparenthesised, so pass a plain name.
#define  Zero( X )  memset( & X, 0, sizeof X )

//  Loop through the console lines ( or any dynamic array ).
//  Tt is the type prefix ( "Ln" gives LnP / LnA ); Xx is the array struct.
//  Visits every element from (Xx).BB through (Xx).PP; inside the body, B and P
//  both hold the current element and PP points at it.  Nothing runs when
//  (Xx).BB is null.
//  Declares P, B, BB, EE and PP in the current scope, so use it once per scope.
//  The bound is tested BEFORE the element is read: the element past (Xx).PP
//  is never fetched, and B / P keep the last visited element after the loop.
#define  LoopXx( Tt, Xx )  \
  Tt##P P = 0, B = 0 ; Tt##A BB = (Xx).BB,  \
  EE = (Xx).PP + 1, PP = BB - 1 ;  \
  if ( BB ) while( ++PP < EE && ( B = P = *PP, 1 ) )
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
  //  Converter state shared by the functions below, plus four pointer arrays
  //  ( _id, _Alt, _Src, _hRef ) that Br() prints and then zeroes.
  int PRE, inTag, Open⋅A, Indent ;  LnT _id, _Alt, _Src, _hRef ;
  //  Cut trailing '↡', characters <= 32 and '\xA0' off the bottom console line
  //  ( *Ln.PP ).  Outside PRE, a line that ends up empty is dropped ( Ln.PP-- )
  //  and the line above it is trimmed in turn; inside PRE the loop stops after
  //  one line.
  //  B, E, P, Ch and Ch2 are not declared here — presumably file-level scratch
  //  variables shared with the While/WhileBac macros.
  //  NOTE(review): nothing stops Ln.PP from moving below Ln.BB when every line
  //  is blank — confirm a non-blank line always exists above.
rm⋅Trailing() {
  LOOP { B = *Ln.PP, E = 0 ;  WhileBac( Ch == L'↡' || Ch <= 32 || Ch == L'\xA0' );
    if ( ++P < E ) *P = 0 ;  if ( !PRE && !*B ) Ln.PP-- ; else break ;  } }

//  Line break.  After trimming the bottom console line:
//    - outside PRE with nothing queued in _Alt / _Src / _hRef / _id,
//      append '↡' to the current line instead of starting a new one;
//    - inside an open link ( Open⋅A ), append a space;
//    - otherwise print the queued link URIs, alt texts, image URIs and tag IDs,
//      zero each queue, then start a new line holding Indent*2 spaces
//      ( capped by the 10 spaces in the literal ).
Br() { int rv ;  rm⋅Trailing();
  if ( !PRE && !_Alt.BB && !_Src.BB && !_hRef.BB && !_id.BB ) {
    //  Print '↡' instead of a new line, conserve screen space.
    Cat( L'↡' ); return ;  }

  if ( Open⋅A ) {
    //  Don't wrap link⋅labels.
    Cat( 32 ); return ;  }

  if ( _hRef.BB ) {
    LoopXx( Ln, _hRef )
      //  Print out URIs, one per line.
      Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
    Zero( _hRef );  }

  if ( _Alt.BB ) {
    LoopXx( Ln, _Alt )
      //  Print out "alternate (img) text", one per line.
      Sh( L"\t\xA0%s", Human( B ) );
    Zero( _Alt );  }

  if ( _Src.BB ) {
    LoopXx( Ln, _Src )
      //  Print out (img) URIs, one per line.
      Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
    Zero( _Src );  }

  if ( _id.BB ) {
    LoopXx( Ln, _id )
      //  Print (tag) IDs, one per line.
      //  Human( B ) is called for every ID before the test; its result is not used here.
      if ( Human( B ), PP == BB && **Ln.PP != L'⌗' )
        //  First ID, give it its own line.
        Sh( L"⌗%s", B );
      else
        //  Concatenate IDs to the previous ones.
        //  ( _T is a buffer declared outside this excerpt; rv receives Str()'s return value. )
        rv = Str( _T, L", ⌗%s", B ), Cat( _T, rv );

    Zero( _id );  }

  //  Print a blank line (Br), possibly indented.
  Sh( L"%.*s", Indent*2, L"          " );  }
  
  //  Up to 9,999 lines of HTML can be converted to plain⋅text.
  //  BB⋅Decode⋅ is the fixed array of line pointers; BB⋅Decode / EE⋅Decode
  //  delimit the part of it filled by Buff⋅Limited.
  const int mx⋅Lns⋅Decode = 9999 ;  LnA  BB⋅Decode, EE⋅Decode ;
  LnP  FileName, Wide, BB⋅Decode⋅[ mx⋅Lns⋅Decode ];

  //  Remember the index of the current bottom line ( iPP ) and point BB⋅Decode
  //  at the fixed array.  Declares iPP in the current scope.
  #define Init⋅Buff⋅Limited  \
    int iPP = Ln.PP - Ln.BB ;  BB⋅Decode = BB⋅Decode⋅ ;

  //  Save off the console lines ( Ln ).
  //  EE is one past the bottom line when that line is non-empty, otherwise the
  //  bottom line itself.  Ln.PP is rewound to the index saved in iPP, and the
  //  line pointers after it, up to EE, are copied into BB⋅Decode.
  //  er() is defined elsewhere — presumably it caps the count at mx⋅Lns⋅Decode.
  #define Buff⋅Limited  \
    { LnA EE = Ln.PP + !!**Ln.PP, BB = Ln.PP = Ln.BB + iPP ;  \
      int Lines = er( mx⋅Lns⋅Decode, EE - ++BB );  \
      EE⋅Decode = BB⋅Decode + Lines,  \
      memmove( BB⋅Decode, BB, Lines * szPtr ) ;  }

  //  Get the next line or cleanup/exit.
  //  Uses the caller's B, P, PP, EE and iPP, and returns from the caller once
  //  the input is exhausted.
  #define feedMe  { if ( !feedMe₍₎( B, P, PP, EE, iPP ) ) return ;  }
//  Makes sure there is something to parse at P.
//    B, P : current input line and the parse position in it ( updated ).
//    PP   : cursor into the array of input lines ( advanced when a line is fetched ).
//    EE   : one past the last input line.
//    iPP  : console index saved by Init⋅Buff⋅Limited; used by Buff⋅Limited.
//  Returns 1 with Ch / Ch2 set when parsing can continue.  Returns 0 after
//  rm⋅Trailing() and Buff⋅Limited when the lines run out or the next line
//  matches MimeSep ( presumably a MIME boundary; it is defined elsewhere ).
int feedMe₍₎( LnP &B, LnP &P, LnA &PP, LnA EE, int iPP ) {

  //  The caller starts with "P = 0", so P is null on the very first call;
  //  treat a null P like an exhausted line instead of reading through it.
  if ( Ch = !P ? 0 : *P )
    //  Return, no need for another (input, HTML) line.
    return 1 ;

  if ( PRE && !inTag )
    Br();
  else { LnP &T = *Ln.PP ;
    // Print a space ( instead of a newline ).
    if ( rv = szStr( T ), rv && T[ rv - 1 ] > 32 ) Cat( 32 );  }

  if ( ++PP >= EE || ( P = B = *PP, B && MimeSep && EqN( B, *MimeSep, Sz_MimeSep ) ) ){
    //  Cleanup and exit.
    rm⋅Trailing(); Buff⋅Limited  return 0 ;  }

  //  Set Ch and Ch2 to the first two characters.
  //  The test above already allows for a null B, so a null line reads as empty.
  Ch = !B ? 0 : *B, Ch2 = !Ch ? 0 : B[1];  return 1 ;  }

  //  Identify an HTML tag/entity.
  //  True when Ch / Ch2 begin an entity ( "&#" or '&' + letter ) or a tag
  //  ( "<!", "<?", '<' + letter, or "</" + letter ).  The "</" case reads P[2],
  //  so the test only makes sense while Ch and Ch2 mirror P[0] and P[1].
  #define Ch⋅BoTag& ( \
       Ch == '&' && ( Ch2 == '#' || isAlpha( Ch2 ) )  \
    || Ch == '<' && ( Ch2 == '!' || Ch2 == '?' || isAlpha( Ch2 ) || Ch2 == '/' && isAlpha( P[2] ) ) )

//  Convert (console) lines, BB to EE.
HTML⋅to⋅Text( LnA  BB, LnA  EE ) {
  static rv ;  Init⋅Buff⋅Limited  LnA PP = BB - 1 ;
  //  Print a blank line.
  Sh(0);
  LnP  euText, B = 0, P = 0, Alt, Src, id, hRef = 0 ;
  int PlainText ;
  Zero( _Alt ), Zero( _Src ), Zero( _hRef ), Zero( _id );
  PlainText = PRE = Ch = Indent = Open⋅A = 0 ;

  ➤BoTag&:  inTag = 0 ;  feedMe
  if ( PlainText ) {
    //  PlainText, no HTML.
    euText = P, P += szStr( P ), Ch = 0 ;  } 
  else {
    // Parse out HTML tags/entities and text.
    euText = 0 ; While( !Ch⋅BoTag& ) if ( !euText && Ch ) euText = P ;  }

  LnP  BoTag& = P ; LnA  BoTagPP = PP ;  if ( !euText ) goto NonText ;
  { LnP  B¹, B², P², E¹, E² ;  int Sz¹, Sz², Sz¹⁺², Sz³, SP, SP², Ch, Ch2 ;
    //  Remove leading spaces.
    if ( B¹ = P² = E¹ = *Ln.PP, Sz¹ = szStr( P² ), E¹ += Sz¹, *P² == L'↡' ) {
      While²( Ch == L'↡' || Ch <= 32 );
      Sz¹ -= P² - B¹, wmemmove( B¹, P², Sz¹ + 1 );  }

    //  Append text (euText, B²) to the current line (B¹). 
    if ( B² = euText, E² = BoTag&, !PRE ) {
      //  Consolidate spaces. PRE tags are honored.
      Ch = !Sz¹ ? 0 : E¹[-1], SP = Ch <= 32, P² = B², D = B² - 1 ;
      While²( P² < E² )
        if ( SP² = Ch <= 32 || Ch == L'\xFeFF', !SP && SP² )
          *++D = 32, SP = 1 ;
        else if ( !SP² ) *++D = Ch, SP = 0 ;

      // E² marks the end of the (now smaller) line, B².
      E² = ++D ;  }

    if ( Sz² = E² - B², Sz² <= 0 ) goto NonText ;
    if ( PRE ) { Cat( B², Sz² ); goto NonText ;  }
    Sz¹ = szStr( B¹ );
    //  Wrap the ( now larger ) line.
    //  ( ColsWin - 2, 126 columns, monospaced ). 
    LOOP {
    
      if ( Sz³ = ColsWin - 2, Sz¹⁺² = Sz¹ + Sz², Sz¹⁺² <= Sz³ ) {
        //  Nothing to wrap, grow the line and/or break.
        Cat( B², Sz² ); break ;  }

      E² = B² + Sz³ - Sz¹ ; WhileBac²( Iden( Ch ) );
      if ( Sz³ = P² - B² + 1, Sz³ >= 22 )
        //  Add part of B² ( no less than 22 characters )
        //  to the current line. 
        Cat( B², Sz³ ), Sh(0), Sz¹ = 0 ; 
      else
        //  Give B² its own line.
        Sh( L"%.*s", Sz¹ = Sz³, B² );
      
      //  Set B² to the not⋅yet⋅printed. ( or blank it )
      Sz² -= Sz³, B² = ++P² ;  } }
    
  NonText: //  Parse the HTML tag.
    inTag = 1 ;  if ( !Ch ) goto ➤BoTag& ;

Jeff-Relf.Me

unread,
Mar 27, 2012, 2:49:40 PM
to
The PRE tag ( HTML ) looks best on Google Groups because
it overrides the default: variable⋅width glyphs, auto⋅wrapped.

slrn should render HTML as plain text ( a simple task ) because
you ( Peter⋅J⋅Ross ) might want three HTML viewers:
slrn, Lynx and/or FireFox.  Use the default, slrn, or an alternate.
⋅⋅⋅⋅⋅⋅⋅⋅
I'm fully aware that the code I post benefits no one but me,
no need to remind me; my "hideously obfuscated" code isn't for you.
To me, X.CPP is priceless, literally; "Virtue[work] is its own reward".

When I post code, I think about it −− both before and after −−
prompting serious changes −− rewrites −− that's why I do it.

From "Jeff-Relf.Me/X.CPP", the revised code:

//  LnP is a pointer to wide characters ( one line of text );
//  LnA is a pointer to LnP ( an array of lines ).
typedef wchar_t  *LnP ; typedef  LnP  *LnA ;
Sh( LnP S, ... ); //  Like "printf()", for my custom console.

  //  Three cursors into an array of line pointers.  The code in view uses BB as
  //  the first element and PP as the last element in use ( loops stop at PP + 1 );
  //  EE is not used in this excerpt — presumably the end of the allocation.
  struct  LnT { LnA  BB, PP, EE ;  };
  //  The console lines, a dynamic array of pointers.
  //  "*Ln.PP" is the bottom⋅most line.
  LnT  Ln ;
  //  Allocate from a temporary heap; never use "free()" or "realloc()".
//  Hands back a block obtained from HeapAlloc() on "Heap"; Sz is passed
//  straight through as the requested size.  The result is never null:
//  a non-positive Sz or a failed allocation ends the program with exit(1).
LnP MallocTmp( int Sz ) {
  LnP  Mem = 0 ;

  //  Only ask the heap when the request makes sense.
  if ( Sz > 0 )
    Mem = (LnP)HeapAlloc( Heap, 0, Sz );

  //  Panic ! ( never happens [ borked heap ] )
  if ( !Mem )
    exit(1);

  return Mem ;
}

//  Grow the bottom⋅most console line ( *Ln.PP ) by up to Sz² characters
//  taken from B².  The old text and the new characters are formatted into a
//  fresh MallocTmp() buffer and *Ln.PP is re-pointed at it; the previous
//  buffer is left alone.  A non-positive Sz² does nothing.
Cat( LnP B², int Sz² ) {
  if ( Sz² <= 0 )
    return;

  LnP &Line = *Ln.PP ;
  //  Current length, plus the appended characters, plus one more character.
  int Chars = szStr( Line ) + Sz² + 1 ;
  LnP Grown = MallocTmp( Chars * szChr );

  Str( Grown, L"%s%.*s", Line, Sz², B² );
  Line = Grown ;
}

//  Single-character overload: forwards the address of Ch with a count of 1.
Cat( wchar_t Ch ) { Cat( &Ch, 1 );  }
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
//  Handy macros.
//  Most of them expand to more than one statement and read/write names that
//  must already exist where they are used ( Ch, Ch2, Ch−1, P, B, E and the
//  "²" variants ), so they cannot serve as an unbraced if/else body.

//  Endless loop; leave it with break, return or goto.
#define  LOOP  while(1)
//  Runs the body with J = 0 .. N-1.  Declares J and LLL in the current scope.
#define  Loop( N )  int J = - 1, LLL = N ; while( ++J < LLL )
//  "While(0)" (for example) loops zero times and doesn't alter P;
//  it's a good way to set Ch and Ch2 to P[0] and P[1].
//
//  The macro is "while( A && B )", basically.
//  A copies the previous Ch into Ch−1 and tests it.
//  B sets Ch ( and Ch2 ), increments P, then evaluates the loop condition.
//
//  B won't happen unless A is true.  First time through the loop,
//  A is true.  Second pass, A is false if, initially, "*P == 0".
#define  While( ↻ )  Ch = 1, P-- ; while( ( Ch−1 = Ch ) && ( Ch = *++P, Ch2 = !Ch ? 0 : P[1], ↻ ) )
//  Same scan, driven by P² instead of P.
#define  While²( ↻ )  Ch = 1, P²-- ; while( ( Ch−1 = Ch ) && ( Ch = *++P², Ch2 = !Ch ? 0 : P²[1], ↻ ) )

//  Backward scan over B.  Starts at E ( set to the end of B when E is null ).
//  Each test sets Ch2 = *P, steps P back one and sets Ch = *P ( 0 once P has
//  moved before B ); it keeps looping while Ch is non-zero and the condition
//  holds.  When the scan stops, P is on the character that failed the test,
//  or one position before B.
#define  WhileBac( ↻ )  \
  Ch = Ch2 = 0;  if ( !E )  E = B + szStr( B );  P = E ;  \
  while ( P >= B && ( Ch2 = *P, ( Ch = --P < B ? 0 : *P ) && ( ↻ ) ) )

//  Same backward scan, using B², E² and P².
#define  WhileBac²( ↻ )  \
  Ch = Ch2 = 0;  if ( !E² )  E² = B² + szStr( B² );  P² = E² ;  \
  while ( P² >= B² && ( Ch2 = *P², ( Ch = --P² < B² ? 0 : *P² ) && ( ↻ ) ) )

//  Fill X with zero bytes.  X is pasted unparenthesised, so pass a plain name.
#define  Zero( X )  memset( & X, 0, sizeof X )

//  Loop through the console lines ( or any dynamic array ).
//  Tt is the type prefix ( "Ln" gives LnP / LnA ); Xx is the array struct.
//  Visits every element from (Xx).BB through (Xx).PP; inside the body, B and P
//  both hold the current element and PP points at it.  Nothing runs when
//  (Xx).BB is null.
//  Declares P, B, BB, EE and PP in the current scope, so use it once per scope.
//  The bound is tested BEFORE the element is read: the element past (Xx).PP
//  is never fetched, and B / P keep the last visited element after the loop.
#define  LoopXx( Tt, Xx )  \
  Tt##P P = 0, B = 0 ; Tt##A BB = (Xx).BB,  \
  EE = (Xx).PP + 1, PP = BB - 1 ;  \
  if ( BB ) while( ++PP < EE && ( B = P = *PP, 1 ) )
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅
  //  Converter state shared by the functions below, plus four pointer arrays
  //  ( _id, _Alt, _Src, _hRef ) that Br() prints and then zeroes.
  int First⋅Line, PRE, inTag, Open⋅A, Indent ;  LnT _id, _Alt, _Src, _hRef ;
  //  Tidy the bottom console line ( *Ln.PP ):
  //    1. cut trailing '↡', characters <= 32 and '\xA0'; outside PRE a line
  //       left empty is dropped ( Ln.PP-- ) and the line above is trimmed too;
  //    2. if the surviving line starts with '↡', shift out the leading run of
  //       '↡' and characters <= 32;
  //    3. leave Ch / Ch2 holding the line's first two characters, P == B.
  //  The very first call ( First⋅Line set ) only clears the flag; Ch and Ch2
  //  are left untouched on that path.
  //  B, E, P, Ch and Ch2 are not declared here — presumably file-level scratch
  //  variables shared with the While/WhileBac macros.
  //  NOTE(review): nothing stops Ln.PP from moving below Ln.BB when every line
  //  is blank — confirm a non-blank line always exists above.
Trim() {
  if ( First⋅Line ) {
    //  The first newline, don't trim anything.
    First⋅Line = 0; return;  }

  LOOP { B = *Ln.PP, E = 0 ;  WhileBac( Ch == L'↡' || Ch <= 32 || Ch == L'\xA0' );
    if ( ++P < E ) *P = 0 ;  if ( !PRE && !*B ) Ln.PP-- ; else break ;  }
    
  if ( P = B, *B == L'↡' ) {
    //  '↡' replaced newlines; remove them ( and whitespace )
    //  from the start of the (console) line.
    //  E still marks the end found before the trailing cut, so the move may
    //  carry a few characters past the new terminator; they stay in the buffer.
    While( Ch == L'↡' || Ch <= 32 );  wmemmove( B, P, E - P + 1 ), P = B;  }
  
  //  Set Ch and Ch2 to the first two characters.
   While(0);  }

Br() {
  if ( Trim(), !PRE && Ch != L'⌗' && !( Ch == '\t' && Ch2 == L'\xA0' )
    //  URI lines start with "\t\A0" and
    //  '⌗'( U+2317 ) starts a line of tag IDs.
    //
    //  If the line above isn't a URI or tag IDs, print '↡'
    //  instead of a new line ( to conserve screen space ).
    Cat( L'↡' ); return ;  }

  if ( Open⋅A ) {
    //  Don't wrap text "buttons".
    Cat( 32 ); return ;  }

  if ( _hRef.BB ) {
    LoopXx( Ln, _hRef )
      //  Print out URIs, one per line.
      Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
    Zero( _hRef );  }

  if ( _Alt.BB ) {
    LoopXx( Ln, _Alt )
      //  Print out "alternate (img) text", one per line.
      Sh( L"\t\xA0%s", Human( B ) );
    Zero( _Alt );  }

  if ( _Src.BB ) {
    LoopXx( Ln, _Src )
      //  Print out (img) URIs, one per line.
      Sh( L"\t\xA0%s%s", ⁺Folder( B ), Human( B ) );
    Zero( _Src );  }

  if ( _id.BB ) {
    LoopXx( Ln, _id )
      if ( Human( B ), **Ln.PP != L'⌗' )
        //  If the current (console) line is tag IDs, grow it;
        //  otherwise, start a new line.
        Sh( L"⌗%s", B );
      else
        //  Concatenate IDs to the previous ones.
        rv = Str( _T, L", ⌗%s", B ), Cat( _T, rv );

    Zero( _id );  }

  //  Print a blank line (Br), possibly indented.
  Sh( L"%.*s", Indent*2, L"          " );  }

  //  Up to 9,999 lines of HTML can be converted to plain⋅text.
  //  BB⋅Decode⋅ is the fixed array of line pointers; BB⋅Decode / EE⋅Decode
  //  delimit the part of it filled by Buff⋅Limited.
  const int mx⋅Lns⋅Decode = 9999 ;  LnA  BB⋅Decode, EE⋅Decode ;
  LnP  FileName, Wide, BB⋅Decode⋅[ mx⋅Lns⋅Decode ];

  //  Remember the index of the current bottom line ( iPP ) and point BB⋅Decode
  //  at the fixed array.  Declares iPP in the current scope.
  #define Init⋅Buff⋅Limited  \
    int iPP = Ln.PP - Ln.BB ;  BB⋅Decode = BB⋅Decode⋅ ;

  //  Save off the console lines ( Ln ).
  //  EE is one past the bottom line when that line is non-empty, otherwise the
  //  bottom line itself.  Ln.PP is rewound to the index saved in iPP, and the
  //  line pointers after it, up to EE, are copied into BB⋅Decode.
  //  er() is defined elsewhere — presumably it caps the count at mx⋅Lns⋅Decode.
  #define Buff⋅Limited  \
    { LnA EE = Ln.PP + !!**Ln.PP, BB = Ln.PP = Ln.BB + iPP ;  \
      int Lines = er( mx⋅Lns⋅Decode, EE - ++BB );  \
      EE⋅Decode = BB⋅Decode + Lines,  \
      memmove( BB⋅Decode, BB, Lines * szPtr ) ;  }

  //  Get the next line or cleanup/exit.
  //  Uses the caller's B, P, PP, EE and iPP, and returns from the caller once
  //  the input is exhausted.
  #define feedMe  { if ( !feedMe₍₎( B, P, PP, EE, iPP ) ) return ;  }
//  Makes sure there is something to parse at P.
//    B, P : current input line and the parse position in it ( updated ).
//    PP   : cursor into the array of input lines ( advanced when a line is fetched ).
//    EE   : one past the last input line.
//    iPP  : console index saved by Init⋅Buff⋅Limited; used by Buff⋅Limited.
//  Returns 1 with Ch / Ch2 set when parsing can continue.  Returns 0 after
//  Trim() and Buff⋅Limited when the lines run out or the next line matches
//  MimeSep ( presumably a MIME boundary; it is defined elsewhere ).
int feedMe₍₎( LnP &B, LnP &P, LnA &PP, LnA EE, int iPP ) {
  //  Set Ch and Ch2 to the first two characters of the line (P).
  While(0);
  if ( Ch )
    //  Keep parsing the current line, don't fetch a new one.
    return 1 ;

  //  The current input line is used up: end the output line ( PRE text )
  //  or separate the words with one space.
  if ( PRE && !inTag )
    Br();
  else { LnP &T = *Ln.PP ;
    // Print a space ( instead of a newline ).
    if ( rv = szStr( T ), rv && T[ rv - 1 ] > 32 ) Cat( 32 );  }

  if ( ++PP >= EE || ( P = B = *PP, B && MimeSep && EqN( B, *MimeSep, Sz_MimeSep ) ) ){
    //  Cleanup and exit.
    Trim(); Buff⋅Limited  return 0 ;  }

  //  Load Ch / Ch2 from the freshly fetched line.
  //  NOTE(review): the test above allows for a null B, yet While(0) reads
  //  through P == B unconditionally — confirm input lines are never null.
  While(0);  return 1 ;  }

  //  Identify an HTML tag/entity.
  //  True when Ch / Ch2 begin an entity ( "&#" or '&' + letter ) or a tag
  //  ( "<!", "<?", '<' + letter, or "</" + letter ).  The "</" case reads P[2],
  //  so the test only makes sense while Ch and Ch2 mirror P[0] and P[1].
  #define Ch⋅BoTag& ( \
       Ch == '&' && ( Ch2 == '#' || isAlpha( Ch2 ) )  \
    || Ch == '<' && ( Ch2 == '!' || Ch2 == '?' \
    || isAlpha( Ch2 ) || Ch2 == '/' && isAlpha( P[2] ) ) )

//  Convert (console) lines; from BB to EE.
HTML⋅to⋅Text( LnA  BB, LnA  EE ) {
  static rv ;  Init⋅Buff⋅Limited  LnA PP = BB - 1 ;
  //  Print a blank line.
  Sh(0);
  LnP  euText, B = L"", P = B, Alt, Src, id, hRef = 0 ; int PlainText ;
  Zero( _Alt ), Zero( _Src ), Zero( _hRef ), Zero( _id );
  First⋅Line = 1, PlainText = PRE = Ch = Indent = Open⋅A = 0 ;
  ➤BoTag&:  inTag = 0 ;  feedMe
  if ( PlainText ) {
    //  Don't parse anything.
    euText = P, P += szStr( P ), Ch = 0 ;  }
  else {
    // Parse out HTML tags/entities and text.
    euText = 0 ; While( !Ch⋅BoTag& ) if ( !euText && Ch ) euText = P ;  }

  LnP  BoTag& = P ; LnA  BoTagPP = PP ;  if ( !euText ) goto Parse⋅Tag ;
  { LnP  B¹, B², P², E¹, E² ;  int Sz¹, Sz², Sz¹⁺², Sz³, SP, SP², Ch, Ch2 ;
    B¹ = E¹ = *Ln.PP, Sz¹ = szStr( B¹ ), E¹ += Sz¹ ;
    //  Add parsed⋅out text ( euText, B² ) to the current line ( B¹ ).
    if ( B² = euText, E² = BoTag&, !PRE ) {
      //  Consolidate spaces. PRE tags are honored.
      Ch = !Sz¹ ? 0 : E¹[-1], SP = Ch <= 32, P² = B², D = B² - 1 ;
      While²( P² < E² )
        if ( SP² = Ch <= 32 || Ch == L'\xFeFF', !SP && SP² ) *++D = 32, SP = 1 ;
        else if ( !SP² ) *++D = Ch, SP = 0 ;

      // E² marks the end of the (now smaller) line, B².
      E² = ++D ;  }

    if ( Sz² = E² - B², Sz² <= 0 || Sz² == 1 && *B² <= 32 )
      //  Skip the printing, the line is blank.
      goto Parse⋅Tag ;

    if ( PRE ) { Cat( B², Sz² ); goto Parse⋅Tag ;  }
    //  Print the parsed⋅out text, wrapped to ColsWin - 2,
    //  125 columns, monospaced.
    LOOP { if ( Sz³ = ColsWin - 2, Sz¹⁺² = Sz¹ + Sz², Sz¹⁺² <= Sz³ ) { Cat( B², Sz² ); break ;  }
      E² = B² + Sz³ - Sz¹ ; WhileBac²( Iden( Ch ) );
      if ( Sz³ = P² - B² + 1, Sz³ >= 22 ) Cat( B², Sz³ ), Sh(0), Sz¹ = 0 ; 
      else Sh( L"%.*s", Sz¹ = Sz³, B² );   Sz² -= Sz³, B² = ++P² ;  } }
    
  Parse⋅Tag:  inTag = 1 ;  if ( !Ch ) goto ➤BoTag& ;
0 new messages