code review: APL scanner and parser

luser droog

unread,

Feb 12, 2016, 7:26:51 AM2/12/16

to

Alright, so maybe my last post wasn't interesting enough. Perhaps not
hard-core C hacksumschit enough. So here ya you. I dare you to find a
bug. I dare you to even make sense of this table-driven madness.

These modules both call functions exported from other modules. You can
find them via their header files from the repo:
https://github.com/luser-dr00g/inca/tree/master/olmec

The scanner module:

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.h

int newobj(int *s, int n, int state);
array wd(int *s, int n);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.c
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ar.h"
#include "en.h"

#include "wd.h"

#define DIGIT (int[]){'0','1','2','3','4','5','6','7','8','9', 0}
#define DOT (int[]){'.', 0}
#define LPAR (int[]){'(', 0}
#define RPAR (int[]){')', 0}
#define QUOTE (int[]){'\'', 0}
#define SPACE (int[]){' ','\t', 0}
#define LEFT (int[]){0x2190, 0}
#define CR (int[]){0x0D, 0}

/* strchr for int strings */
int *classint(int *class, int el){
for (; *class; class++)
if (el == *class)
return class;
return NULL;
}

int *cclass[] = {0, DIGIT, DOT, QUOTE, LPAR, RPAR, SPACE, LEFT, CR};
enum state {
ini=0, //indeterminate
num=10, //numbers and vectors of numbers
dot=20, //initial dot
str=30, //initial quote
quo=40, //end or escape quote
oth=50, //identifier or other symbol
sng=60, //copula or other self-delimiting symbol
};

int wdtab[][sizeof cclass/sizeof*cclass] = {
/*char-class*/
/*none 0-9 . ' ( ) sp <- \r */
{ oth+2, num+2, dot+2, str+2, sng+2, sng+2, ini+0, sng+2, ini+0 },
{ oth+1, num+0, num+0, str+1, oth+1, oth+1, num+0, sng+1, ini+1 },
{ oth+0, num+0, oth+0, str+1, ini+1, ini+1, ini+1, sng+1, ini+1 },
{ str+0, str+0, str+0, quo+0, str+0, str+0, str+0, str+0, ini+1 },
{ oth+1, num+1, dot+1, str+0, oth+1, oth+1, ini+1, sng+1, ini+1 },
{ oth+0, num+1, oth+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
{ oth+1, num+1, dot+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
};

#define emit(a,b,c) (*p++=newobj(s+(a),(b)-a,c*10))

array wd(int *s, int n){
int a,b;
int i,j,i_,state,oldstate;
int c;
array z = array_new(n+1);
int *p=z->data;
//printf("n=%d\n",n);

state=0;
for (i=0; i<n; i++){
c=s[i];
a=0;
for (i_=1; i_<sizeof cclass/sizeof*cclass; i_++){
if (classint(cclass[i_],c)){
a=i_;
break;
}
}
b=wdtab[state][a];
oldstate=state;
state=b/10;
switch(b%10){ //encoded actions
case 0: break; // do nothing
case 1: emit(j,i,oldstate); // generate a token (and)
/*fallthrough*/
//printf("wd %p\n", (void*)getptr(p[-1]));
case 2: j=i; break; // reset start index
}
}
z->dims[0] = p-z->data;

//printf("wd %p\n", getptr(z->data[0]));
return z;
}

int newobj(int *s, int n, int state){
int t;
switch(state){
case num:
printf("number\n");
{ //TODO create number vectors
char buf[n+1];
for (int i=0; i<n; i++)
buf[i] = s[i];
buf[n] = 0;
return newdata(LITERAL,strtol(buf,NULL,10));
}
case quo:
case str:
printf("string\n");
{
array t=copy(cast(s,n));
int i;
for (i=0; i<n; i++)
t->data[i] = newdata(CHAR, t->data[i]);
return cache(ARRAY, t);
}
case ini:
case dot:
case oth:
case sng:
printf("other\n");
if (n==1){
return newdata(CHAR, *s);
} else {
array t=copy(cast(s,n));
//printf("newobj %p\n", (void*)t);
int i;
for (i=0; i<n; i++)
t->data[i] = newdata(CHAR, t->data[i]);
int x = cache(PROG, t);
//printf("newobj %d(%d,%d) %p\n", x, gettag(x), getval(x), getptr(x));
return x;
}
}
return newdata(NULLOBJ,0);
}

And the Parser module. Not all functions are filled-in.
In particular: monads and dyads and adverbs and conjunctions.
None of those modules are written yet.

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.h

int ex(array e, symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.c
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ar.h"
#include "en.h"
#include "st.h"
#include "wd.h"

typedef struct stack { int top; int a[1];} stack; /* top==0::empty */
#define stackpush(stkp,el) ((stkp)->a[(stkp)->top++]=(el))
#define stackpop(stkp) ((stkp)->a[--((stkp)->top)])
#define stacktop(stkp) ((stkp)->a[(stkp)->top-1])

#define PREDTAB(_) \
_( ANY = 1, qa, 1 ) \
_( VAR = 2, qp, gettag(x)==PROG \
|| (gettag(x)==CHAR && getval(x)!=0x2190 \
&& getval(x)!='(' && getval(x)!=')' ) ) \
_( NOUN = 4, qn, gettag(x)==LITERAL \
|| gettag(x)==CHAR \
|| gettag(x)==ARRAY ) \
_( VRB = 8, qv, gettag(x)==VERB ) \
_( ADV = 16, qo, gettag(x)==ADVERB ) \
_( CONJ = 32, qj, 0 ) \
_( ASSN = 64, qc, gettag(x)==CHAR && getval(x) == 0x2190 ) \
_( LPAR = 128, ql, gettag(x)==CHAR && getval(x) == '(' ) \
_( RPAR = 256, qr, gettag(x)==CHAR && getval(x) == ')' ) \
/**/
#define PRED_FUNC(X,Y,...) int Y(int x){ return __VA_ARGS__; }
PREDTAB(PRED_FUNC)
#define PRED_ENT(X,Y,...) Y,
int (*q[])(int) = { PREDTAB(PRED_ENT) };
#define PRED_ENUM(X,...) X,
enum predicate { PREDTAB(PRED_ENUM)
EDGE = MARK+ASSN+LPAR,
AVN = VRB+NOUN+ADV };
/* encode predicate applications into a binary number */
int classify(int x){
int i,v,r;
for (i=0, v=1, r=0; i<sizeof q/sizeof*q; i++, v*=2)
if (q[i](x))
r |= v;
return r;
}

int monad(int f, int y, int dummy, symtab st){
}
int dyad(int x, int f, int y, symtab st){
}
int adv(int f, int g, int dummy, symtab st){
}
int conj_(int f, int g, int h, symtab st){
}
int spec(int name, int v, int dummy, symtab st){
printf("spec\n");
switch(gettag(name)){
case CHAR:{
int n = 1;
int *p = &name;
symtab tab =findsym(st,&p,&n,1);
tab->val = v;
} break;
case PROG: {
array na = getptr(name);
int n = na->dims[0];
int *p = na->data;
symtab tab = findsym(st,&p,&n,1);
tab->val = v;
} break;
}
return v;
}
int punc(int x, int dummy, int dummy2, symtab st){
return x;
}

#define PARSETAB(_) \
/* pre x y z post,2*/\
_(L0, EDGE, VRB, NOUN, ANY, monad, 3, 1,2,-1, 0,-1) \
_(L1, EDGE+AVN, VRB, VRB, NOUN, monad, -1, 2,3,-1, 1, 0) \
_(L2, EDGE+AVN, NOUN, VRB, NOUN, dyad, -1, 1,2,3, 0,-1) \
_(L3, EDGE+AVN, NOUN+VRB, ADV, ANY, adv, 3, 1,2,-1, 0,-1) \
_(L4, EDGE+AVN, NOUN+VRB, CONJ, NOUN+VRB, conj_, -1, 1,2,3, 0,-1) \
_(L5, VAR, ASSN, AVN, ANY, spec, 3, 0,2,-1, -1,-1) \
_(L6, LPAR, ANY, RPAR, ANY, punc, 3, 1,-1,-1, -1,-1) \
/**/
#define PARSETAB_PAT(label, pat1, pat2, pat3, pat4, ...) \
{pat1, pat2, pat3, pat4},
struct parsetab { int c[4]; } parsetab[] = { PARSETAB(PARSETAB_PAT) };
#define PARSETAB_INDEX(label, ...) label,
enum { PARSETAB(PARSETAB_INDEX) };
#define PARSETAB_ACTION(label,p1,p2,p3,p4, func, pre,x,y,z,post,post2) \
case label: { \
if (pre>=0) stackpush(rstk,t[pre]); \
stackpush(rstk,func(x>=0?t[x]:0,y>=0?t[y]:0,z>=0?t[z]:0,st)); \
if (post>=0) stackpush(rstk,t[post]); \
if (post2>=0) stackpush(rstk,t[post2]); \
} break;

int ex(array e, symtab st){
int n = e->dims[0];
int i,j;
int x;
stack *lstk,*rstk;
int docheck;

for (i=0; i<n; i++) { // sum symbol lengths
if (gettag(e->data[i])==PROG) {
//printf("%p\n", getptr(e->data[i]));
j+=((array)getptr(e->data[i]))->dims[0];
}
}

// allocate and prepare stacks
lstk=malloc(sizeof*lstk + (n+j+1) * sizeof*lstk->a);
lstk->top=0;
stackpush(lstk,mark);
for (i=0; i<n; i++)
stackpush(lstk,e->data[i]);
rstk=malloc(sizeof*rstk + (n+j+1) * sizeof*rstk->a);
rstk->top=0;
stackpush(rstk,null);

while(lstk->top){ //left stack not empty
x=stackpop(lstk);

if (qp(x)){ //parse and lookup name
if (rstk->top && qc(stacktop(rstk))){ //assignment: no lookup
stackpush(rstk,x);
} else {
int *s;
int n;
switch(gettag(x)){
case CHAR: {
s = &x;
n = 1;
} break;
case PROG: {
array a = getptr(x);
s = a->data;
n = a->dims[0];
} break;
}
int *p = s;
symtab tab = findsym(st,&p,&n,0);

if (tab->val == null) {
printf("error undefined\n");
return null;
}
while (n){ //while name
stackpush(lstk,newobj(s,p-s,50)); //pushback prefix
tab = findsym(st,&p,&n,0); //parse name
if (tab->val == null) {
printf("error undefined\n");
return null;
}
}
//replace name with defined value
stackpush(rstk,tab->val);
}
} else { stackpush(rstk,x); }

docheck = 1;
while (docheck){ //check rstk with patterns and reduce
docheck = 0;
if (rstk->top>=4){ //enough elements to check?
//printf("check\n");
int c[4];
for (j=0; j<4; j++)
c[j] = classify(rstk->a[rstk->top-1-j]);
printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);
for (i=0; i<sizeof parsetab/sizeof*parsetab; i++){
/*
printf("%d %d %d %d\n",
parsetab[i].c[0], parsetab[i].c[1],
parsetab[i].c[2], parsetab[i].c[3]);
printf("%d %d %d %d\n",
c[0]&parsetab[i].c[0], c[1]&parsetab[i].c[1],
c[2]&parsetab[i].c[2], c[3]&parsetab[i].c[3]);
*/
if ( c[0] & parsetab[i].c[0]
&& c[1] & parsetab[i].c[1]
&& c[2] & parsetab[i].c[2]
&& c[3] & parsetab[i].c[3] ) {
//printf("match\n");
int t[4];
t[0] = stackpop(rstk);
t[1] = stackpop(rstk);
t[2] = stackpop(rstk);
t[3] = stackpop(rstk);
switch(i){
PARSETAB(PARSETAB_ACTION)
}
docheck = 1; //stack changed: check again
break;
}
}
}
}
}
//assemble results and return
//TODO check/handle extra elements on stack
//(interpolate?, enclose and cat?)
stackpop(rstk); // mark
return stackpop(rstk);
}

josh@LAPTOP-ILO10OOF ~/inca/olmec
$

luser droog

unread,

Feb 19, 2016, 12:21:48 AM2/19/16

to

On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> Alright, so maybe my last post wasn't interesting enough. Perhaps not
> hard-core C hacksumschit enough. So here ya you. I dare you to find a
> bug. I dare you to even make sense of this table-driven madness.
>
> These modules both call functions exported from other modules. You can
> find them via their header files from the repo:
> https://github.com/luser-dr00g/inca/tree/master/olmec
>
> The scanner module:
>

<snip>

>
> And the Parser module. Not all functions are filled-in.
> In particular: monads and dyads and adverbs and conjunctions.
> None of those modules are written yet.
>

<snip>

I'll take the silence as "no obvious problems".
I feel I've come a long way from here:
https://groups.google.com/d/topic/comp.lang.c/_m0DEiFqCv0/discussion

To summarize a little of what I've learned about
scanners and parsers, while I've got you here anyway...

Scanning for tokens almost always follows a Regular
Language, eg. THIS followed by THIS followed by
(THIS or THIS). Simple alteration, repetition, and
sequences. This type of language is efficiently
processed by a finite automaton. There are many
ways of implementing a finite automaton. One popular
way is to use a regular expression package or library.
But from the computer-science angle, only for the
scanning is regex the appropriate tool. And ditch
the backrefences, too, cuz that usually screws the
pooch.

For larger grammatical problems, the appropriate
tool is constrained by the specific language, but
a push-down store of parser-states is a must. This can
be embodied in the function-call graph and handled
by the C function-call/return mechanism, or it can
involve a manually-implemented stack datatype.

Hard to keep generalizing without making shit up,
so I'll stop here. Give the repo a visit! ^^^ you
can browse the parser and scanner code, and my
suite of dope-vector array functions.

And GitHub tracks views that make a pretty graph
for me.

Tim Rentsch

unread,

Feb 21, 2016, 12:36:53 PM2/21/16

to

luser droog <luser...@gmail.com> writes:

> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
>> Alright, so maybe my last post wasn't interesting enough. Perhaps not
>> hard-core C hacksumschit enough. So here ya you. I dare you to find a
>> bug. I dare you to even make sense of this table-driven madness.
>>
>> These modules both call functions exported from other modules. You can
>> find them via their header files from the repo:
>> https://github.com/luser-dr00g/inca/tree/master/olmec
>>
>> The scanner module:
>
> <snip>
>
>> And the Parser module. Not all functions are filled-in.
>> In particular: monads and dyads and adverbs and conjunctions.
>> None of those modules are written yet.
>
> <snip>
>

> I'll take the silence as "no obvious problems". [...]

I looked at the code in the previous posting and found it too
cryptic to try to formulate some sort of response. If you want
something more specific, then, hmmm, the first few things that
come to mind are:

* I didn't see a definition for the 'array' type.

* I don't know what most of the various functions are for;
I would like to see some explanatory commentary, or more
descriptive names, or preferably both. (That also holds
for other file scope identifiers.)

* The function 'ex()' is too long and should be refactored.

Sorry I don't have anything more helpful to say; I just find
the code not accessible enough to want to dig into it more
than I have.

luser droog

unread,

Feb 21, 2016, 5:30:04 PM2/21/16

to

Hmm. I see. Thanks. I assumed it would be comprehensible
because it's significantly *less* cryptic than program it
came from. I assumed wrongly.

I'll tackle the ex() function later.
Here's the scanner code again with more extensive comments.
And the headers mentioned.

Let me describe what it does some. As shown in the "int-code
module" thread, characters are stored in 24bits of a 32bit
int, so Unicode characters are referred-to by their UCS4 code.
This decision affects the scanner code in that it must deal
with "int-strings" although the contents are expected to
primarily be restricted to the ascii domain. One special char
recognized by the scanner is the left-arrow char ← which is
used for assignment of values to variables.

The scanner is also unusual in that it treats most characters
as identifier characters, even the punctuation chars which
designate functions. These identifiers are resolved later
during symbol-lookup using prefix-matching to further scan
and parse the identifiers. For the current purpose of these
functions, it is sufficient to distinguish numbers from non-
numbers and to ensure that certain special characters like
the left-arrow and the parens are encoded as single-chars
and not parts of identifiers.

So mostly it's a big loop that runs through each character
of the int-string. The character is checked against the
character-classes table which determines the column of
big table. The current state (initially 0 or "ini") determines
the row of the big table. The value in the table encodes
a new state (the 10s value) and an action (the 1s value).
The action code adjusts the start-of-token position in
the strings and can trigger the generation of a new token.
The new token is packed into an integer handle and simply
appended to the array structure to be returned.

Does that help and/or make it more interesting?

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.h

int newobj(int *s, int n, int state);
array wd(int *s, int n);

josh@LAPTOP-ILO10OOF ~/inca/olmec

$ cat ar.h
#ifndef AR_H_
#define AR_H_
#include "../ppnarg.h"

typedef struct ar {
int type;
int rank; // number of dimensions
int *dims; // size of each dimension
int cons; // constant term of the indexing formula
int *weight; // corresponding coefficient in the indexing formula
int *data; // address of first array element
int *(*func)(struct ar *,int); // data function (if function type)
} *array;

enum type {
normal,
indirect,
function
};

int productdims(int rank, int dims[]);
array array_new_dims(int rank, int dims[]);
array array_new_function(int rank, int dims[],
int *data, int datan, int *(*func)(array,int));
int *constant(array a,int idx);
int *ret_index(array a,int idx);
void loaddimsv(int rank, int dims[], va_list ap);
array (array_new)(int rank, ...);
#define array_new(...) (array_new)(PP_NARG(__VA_ARGS__),__VA_ARGS__)
array cast_dims(int data[], int rank, int dims[]);
array (cast)(int data[], int rank, ...);
#define cast(data,...) (cast)(data,PP_NARG(__VA_ARGS__),__VA_ARGS__)
array clone(array a);
array copy(array a);

int *elema(array a, int ind[]);
int *elemv(array a, va_list ap);
int *elem(array a, ...);

int *vector_index(int ind, int dims[], int n, int vec[]);
int ravel_index(int vec[], int dims[], int n);

void transpose2(array a);
void transpose(array a, int shift);
void transposea(array a, int spec[]);
array slice(array a, int i);
array slicea(array a, int spec[]);
array slices(array a, int s[], int f[]);
array extend(array a, int extra);

array cat(array x, array y);
array iota(int n);
array scalar(int n);
array (vector)(int n, ...);
#define vector(...) (vector)(PP_NARG(__VA_ARGS__),__VA_ARGS__)

#endif

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat en.h
typedef struct datum {
unsigned int val:24;
unsigned int tag:8;
} datum;

typedef union integer {
datum data;
int32_t int32;
} integer;

enum tag {
LITERAL, /* val is a 24-bit 2's comp integer */
NUMBER, /* val is an index in the number table */
CHAR, /* val is a 21-bit Unicode code point padded with zeros */
PCHAR, /* val is a an executable char */
PROG, /* val is an (index to an) executable code fragment (ARRAY of PCHAR)*/
ARRAY, /* val is a(n index to a) boxed array */
SYMTAB, /* val is a(n index to a) symbol table */
NULLOBJ, /* val is irrelevant (s.b. 0) */
VERB, /* val is a(n index to a) verb object */
ADVERB, /* val is a(n index to a) verb object */
MARKOBJ, /* val is irrelevant (s.b. 0) */
LPAROBJ,
RPAROBJ,
};

extern int null;
extern int mark;

void init_en();

int gettag(int d);
int getval(int d);
int newdata(int tag, int val);

int cache(int tag, void *ptr);
void *getptr(int d);
int getfill(int d);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.h

int newobj(int *s, int n, int state);
array wd(int *s, int n);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.c
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ar.h" // array type
#include "en.h" // atomic encoding

#include "wd.h"

// character classes

#define DIGIT (int[]){'0','1','2','3','4','5','6','7','8','9', 0}
#define DOT (int[]){'.', 0}
#define LPAR (int[]){'(', 0}
#define RPAR (int[]){')', 0}
#define QUOTE (int[]){'\'', 0}
#define SPACE (int[]){' ','\t', 0}
#define LEFT (int[]){0x2190, 0}
#define CR (int[]){0x0D, 0}

/* strchr for int strings */
int *classint(int *class, int el){
for (; *class; class++)
if (el == *class)
return class;
return NULL;
}

// state-machine table
// input char is compared against classes 1..N-1 and no match
// selects column 0 (marked none in wdtab)
// wdtab[state][class] contains a new state and action code
// action=0 :: do nothing
// 1 :: emit previous token and reset start position
// 2 :: (re)set start position
// 3 :: emit previous token without prev char and set start to prev char

int *cclass[] = {0, DIGIT, DOT, QUOTE, LPAR, RPAR, SPACE, LEFT, CR};
enum state {
ini=0, //indeterminate

dot=10, //initial dot .
num=20, //integer 0
dit=30, //medial dot 0.
fra=40, //fraction 0.0
str=50, //initial quote '
quo=60, //end or escape quote 'aaa''
oth=70, //identifier or other symbol a+
dut=80, //trailing dot +.
sng=90, //copula or other self-delimiting symbol ()

};

int wdtab[][sizeof cclass/sizeof*cclass] = {
/*char-class*/

/*state none 0-9 . ' ( ) sp <- \r */
/*0*/ { oth+2, num+2, dot+2, str+2, sng+2, sng+2, ini+0, sng+2, ini+0 },
/*10*/{ oth+0, fra+0, oth+0, str+1, ini+1, ini+1, ini+1, sng+1, ini+1 },
/*20*/{ oth+1, num+0, dit+0, str+1, oth+1, oth+1, ini+1, sng+1, ini+1 },
/*30*/{ oth+0, num+1, dut+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*40*/{ oth+1, fra+0, dot+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*50*/{ str+0, str+0, str+0, quo+0, str+0, str+0, str+0, str+0, ini+1 },
/*60*/{ oth+1, num+1, dot+1, str+0, oth+1, oth+1, ini+1, sng+1, ini+1 },
/*70*/{ oth+0, num+1, dut+0, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*80*/{ oth+0, num+3, dut+0, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*90*/{ oth+1, num+1, dot+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
};

#define emit(start,end,state) (*p++=newobj(s+(start),(end)-(start),(state)*10))

// scan up to n chars from s and produce 1D array of encoded expression

array wd(int *s, int n){
int a,b;

int i,j,i_,state=0,oldstate=0,oldoldstate=0;
int c;
array z = array_new(n+1); // create an array of maximum possible size
int *p=z->data; // p pointer appends data to array

//printf("n=%d\n",n);

state=0;
for (i=0; i<n; i++){

printf("i= %d, state = %d, p-s = %d\n", i, state*10, (int)(p-z->data));

c=s[i]; // classify c according to cclass table

a=0;
for (i_=1; i_<sizeof cclass/sizeof*cclass; i_++){
if (classint(cclass[i_],c)){
a=i_;
break;
}
}

b=wdtab[state][a]; // lookup new state from wdtab
oldoldstate=oldstate;
oldstate=state;
state=b/10;

switch(b%10){ // perform encoded actions

case 0: break; // do nothing
case 1: emit(j,i,oldstate); // generate a token

j=i;
break;
case 2: j=i; // just reset start index
break;
case 3: emit(j,i-1,oldoldstate);
j=i-1;
break;
}
if (p-z->data>1){ // collapse adjacent numbers into number vectors
if(gettag(p[-1])==LITERAL && gettag(p[-2])==LITERAL)
(--p)[-1] = cache(ARRAY, vector(p[-1],p[0]));
if(gettag(p[-1])==LITERAL && gettag(p[-2])==ARRAY)
(--p)[-1] = cache(ARRAY, cat(getptr(p[-1]), vector(p[0])));
}
}
z->dims[0] = p-z->data; // set actual encoded length

//printf("wd %p\n", getptr(z->data[0]));
return z;
}

// construct a new object fron n chars starting at s
// select type according to state argument which should be the
// final state the machine was in before the start of the next
// token was discovered.
int newobj(int *s, int n, int state){
switch(state){
case num: // interpret a numeric string
case dit:
/*case fra:*/
printf("number n=%d\n", n);
{
char buf[n+1];
char *p;
for (int i=0; i<n; i++) // make nul-terminated copy

buf[i] = s[i];
buf[n] = 0;

int t = newdata(LITERAL, strtol(buf,&p,10));
if (*p) {
array z = scalar(t);
while(*p) {
int u = newdata(LITERAL, strtol(p,&p,10));
z = cat(z, scalar(u));
}
t = cache(ARRAY, z);
}
return t;
}
case quo: // interpret a character string TODO elim escape quotes
case str:
printf("string n=%d\n", n);

{
array t=copy(cast(s,n));
int i;
for (i=0; i<n; i++)
t->data[i] = newdata(CHAR, t->data[i]);
return cache(ARRAY, t);
}

case ini: // anything else is an executable character or string
case dot:
case dut:
case oth:
case sng:
printf("other n=%d\n", n);
if (n==1){
if (*s == '(') return newdata(LPAROBJ, 0); // special paren objects
if (*s == ')') return newdata(RPAROBJ, 0);
return newdata(PCHAR, *s);

} else {
array t=copy(cast(s,n));
//printf("newobj %p\n", (void*)t);
int i;
for (i=0; i<n; i++)

t->data[i] = newdata(PCHAR, t->data[i]);

Ian Collins

unread,

Feb 22, 2016, 12:57:53 AM2/22/16

to

luser droog wrote:

Why is a function that does this:

> // scan up to n chars from s and produce 1D array of encoded expression

Called this

> array wd(int *s, int n){

?
--
Ian Collins

luser droog

unread,

Feb 22, 2016, 1:51:29 AM2/22/16

to

That's the name from the original interpreter I studied,
called the incunabulum. The name I believe is short for
'words', as the scanning process is often called 'word
formation' in the apl literature.

I should probably think of something better. But I can't
call it 'scan' because another function is called that.

luser droog

unread,

Feb 25, 2016, 12:19:24 AM2/25/16

to

On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:

I've added comments and done some refactoring of the parsing
functions. Some further background links might be useful to
anyone attempting to deeply understand this code. I make
extensive use of X-Macros about which I've written on SO:
http://stackoverflow.com/questions/6635851/real-world-use-of-x-macros
The verb data structure is exactly the same as in J, and these
two resources describe it in more detail than I could do here:
http://archive.vector.org.uk/trad/v094/hui094_85.pdf (scanned)
And an overview of parsing and scanning in J:
http://sblom.github.io/openj-core/iojSent.htm
I've generally tried to copy the structures and algorithms
described there, but translating to the simplest possible
(but no simpler) representation in C.

Some differences exist due to different choices I've made for
the language, such as the exact mechanism of looking up
defined identifiers.

I'd be happy to explain what any part is trying to do.
I've tried to anticipate these spots and provide comments
already. I'm mostly very happy with this code. It represents
almost 3 years of writing, re-writing, reading, re-writing,
re-reading, re-writing, more reading, more re-writing.
But I'm running out of things to take away.

Any comments on this parsing(/executing) code?

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat st.h
/* symbol table */

typedef struct st {
int key;
int val;
int n;
struct st **tab /*[n]*/ ;
} *symtab;

symtab makesymtab(int n);
/* mode=0: prefix match
mode=1: defining search */
symtab findsym(symtab st, int **spp, int *n, int mode);
void def(symtab st, int name, int v);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat vb.h

#define VERBTAB(_) \
/*base monad dyad f g h mr lr rr*/ \
_('+', vid, vplus, 0, 0, 0, 0, 0, 0 ) \
_('-', vneg, vminus, 0, 0, 0, 0, 0, 0 ) \
_('*', vsignum, vtimes, 0, 0, 0, 0, 0, 0 ) \
_(0x2374/*rho*/, vshapeof, vreshape, 0, 0, 0, 0, 0, 0 ) \
_('#', vtally, 0, 0, 0, 0, 0, 0, 0 ) \
_(0x2373/*iota*/, viota, 0, 0, 0, 0, 0, 0, 0 ) \
_('{', vhead, vtake, 0, 0, 0, 0, 1, 0 ) \
_(',', vravel, vcat, 0, 0, 0, 0, 0, 0 ) \
_(';', vraze, vlink, 0, 0, 0, 0, 0, 0 ) \
/**/
typedef struct verb {
int id;
int (*monad)(int,struct verb*);
int (*dyad)(int,int,struct verb*);
int f,g,h; /* operator arguments */
int mr,lr,rr; /* monadic,left,right rank*/
} *verb;

void init_vb(symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.h

// predicate table contains predicate functions
// and associated enum values

#define PREDTAB(_) \
_( ANY = 1, qa, 1 ) \
_( VAR = 2, qp, gettag(x)==PROG \

|| (gettag(x)==PCHAR && getval(x)!=0x2190 /*leftarrow*/ ) ) \

_( NOUN = 4, qn, gettag(x)==LITERAL \
|| gettag(x)==CHAR \
|| gettag(x)==ARRAY ) \
_( VRB = 8, qv, gettag(x)==VERB ) \

_( DEX = 16, qx, 0 ) /*dextri-monadic verb*/\
_( ADV = 32, qo, gettag(x)==ADVERB && ((verb)getptr(x))->monad ) \
_( LEV = 64, qe, 0 ) /*sinister adverb*/\
_( CONJ = 128, qj, gettag(x)==ADVERB && ((verb)getptr(x))->dyad ) \
_( MARK = 256, qm, gettag(x)==MARKOBJ ) \
_( ASSN = 512, qc, gettag(x)==PCHAR && getval(x) == 0x2190 ) \
_( LPAR = 1024, ql, gettag(x)==LPAROBJ ) \
_( RPAR = 2048, qr, gettag(x)==RPAROBJ ) \
_( NUL = 4096, qu, gettag(x)==NULLOBJ ) \
/**/

// declare predicate functions
#define PRED_DECL(X,Y,...) int Y(int);
PREDTAB(PRED_DECL)

// declare predicate enums and composed patterns

#define PRED_ENUM(X,...) X,
enum predicate { PREDTAB(PRED_ENUM)
EDGE = MARK+ASSN+LPAR,
AVN = VRB+NOUN+ADV };

// execute an expression e with environment st
int execute_expression(array e, symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.c
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ar.h"
#include "en.h"
#include "st.h"
#include "wd.h"

#include "vb.h"
#include "ex.h"

int parse_and_lookup_name(stack *lstk, stack *rstk, int x, symtab st);

/* stack type
the size is generously pre-calculated
and so we can skip all bounds checking.
stkp->top is the size (index of next empty slot for next push)
stkp->top-1 is the topmost element
*/

typedef struct stack { int top; int a[1];} stack; /* top==0::empty */
#define stackpush(stkp,el) ((stkp)->a[(stkp)->top++]=(el))
#define stackpop(stkp) ((stkp)->a[--((stkp)->top)])
#define stacktop(stkp) ((stkp)->a[(stkp)->top-1])

/* predicate functions are instantiated according to the table
defined in the header.
the q[] function array is used by classify to apply all
predicate functions yielding a sum of all applicable codes
defined in the table. Specific qualities or combinations
may then be determined easily by masking.

*/
#define PRED_FUNC(X,Y,...) int Y(int x){ return __VA_ARGS__; }
PREDTAB(PRED_FUNC)
#define PRED_ENT(X,Y,...) Y,
int (*q[])(int) = { PREDTAB(PRED_ENT) };

/* encode predicate applications into a binary number

which can be compared to a pattern with a mask */

int classify(int x){
int i,v,r;
for (i=0, v=1, r=0; i<sizeof q/sizeof*q; i++, v*=2)
if (q[i](x))
r |= v;
return r;
}

/* Parser Actions,
each function is called with x y z parameters defined in PARSETAB
*/

int monad(int f, int y, int dummy, symtab st){

printf("monad\n");
verb v = getptr(f);
return v->monad(y,v);

}

int dyad(int x, int f, int y, symtab st){

printf("dyad\n");
verb v = getptr(f);
return v->dyad(x,y,v);

}

int adv(int f, int g, int dummy, symtab st){

printf("adverb\n");
verb v = getptr(g);
return v->monad(f,v);

}

int conj_(int f, int g, int h, symtab st){

printf("conj\n");
verb v = getptr(g);
return v->dyad(f,h,v);
}

//specification

int spec(int name, int v, int dummy, symtab st){

def(st, name, v);

return v;
}

int punc(int x, int dummy, int dummy2, symtab st){
return x;
}

// the Parse Table defines the grammar of the language
// At each stack move, the top four elements of the right stack
// are checked against each of these patterns. A matching pattern
// returns element t[pre] from the pattern area to the right stack
// then calls func(t[x],t[y],t[z]) and pushes the result to the
// right stack, then pushes t[post] and t[post2].
#define PARSETAB(_) \
/* p[0] p[1] p[2] p[3] func pre x y z post,2*/\

_(L0, EDGE, VRB, NOUN, ANY, monad, 3, 1,2,-1, 0,-1) \
_(L1, EDGE+AVN, VRB, VRB, NOUN, monad, -1, 2,3,-1, 1, 0) \

_(L2, ANY, NOUN, DEX, ANY, monad, 3, 2,1,-1, 0,-1) \
_(L3, EDGE+AVN, NOUN, VRB, NOUN, dyad, -1, 1,2,3, 0,-1) \
_(L4, EDGE+AVN, NOUN+VRB, ADV, ANY, adv, 3, 1,2,-1, 0,-1) \
_(L5, ANY, LEV, NOUN+VRB, ANY, adv, 3, 2,1,-1, 0,-1) \
_(L6, EDGE+AVN, NOUN+VRB, CONJ, NOUN+VRB, conj_, -1, 1,2,3, 0,-1) \
_(L7, VAR, ASSN, AVN, ANY, spec, 3, 0,2,-1, -1,-1) \
_(L8, LPAR, ANY, RPAR, ANY, punc, 3, 1,-1,-1, -1,-1) \
_(L9, MARK, ANY, RPAR, ANY, punc, 3, 1,-1,-1, 0,-1) \
_(L10,ANY, LPAR, ANY, NUL, punc, 3, 2,-1,-1, 0,-1) \
/**/

// create parsetab array of structs containing the patterns

#define PARSETAB_PAT(label, pat1, pat2, pat3, pat4, ...) \
{pat1, pat2, pat3, pat4},
struct parsetab { int c[4]; } parsetab[] = { PARSETAB(PARSETAB_PAT) };

// generate labels to coordinate table and execution

#define PARSETAB_INDEX(label, ...) label,
enum { PARSETAB(PARSETAB_INDEX) };

// perform the grammar production, transforming the stack

#define PARSETAB_ACTION(label,p1,p2,p3,p4, func, pre,x,y,z,post,post2) \
case label: { \
if (pre>=0) stackpush(rstk,t[pre]); \
stackpush(rstk,func(x>=0?t[x]:0,y>=0?t[y]:0,z>=0?t[z]:0,st)); \
if (post>=0) stackpush(rstk,t[post]); \
if (post2>=0) stackpush(rstk,t[post2]); \
} break;

// execute expression e using environment st and yield result
int execute_expression(array e, symtab st){

int n = e->dims[0];
int i,j;
int x;
stack *lstk,*rstk;
int docheck;

for (i=0; i<n; i++) { // sum symbol lengths
if (gettag(e->data[i])==PROG) {
//printf("%p\n", getptr(e->data[i]));
j+=((array)getptr(e->data[i]))->dims[0];
}
}

// allocate and prepare stacks
lstk=malloc(sizeof*lstk + (n+j+1) * sizeof*lstk->a);
lstk->top=0;
stackpush(lstk,mark);
for (i=0; i<n; i++)
stackpush(lstk,e->data[i]);
rstk=malloc(sizeof*rstk + (n+j+1) * sizeof*rstk->a);
rstk->top=0;
stackpush(rstk,null);

while(lstk->top){ //left stack not empty
x=stackpop(lstk);

printf("->%d(%d,%x)\n", x, gettag(x), getval(x));

if (qp(x)){ // x is a pronoun?
if (parse_and_lookup_name(lstk, rstk, x, st) == null)
return null;

} else {
stackpush(rstk,x);
}

docheck = 1;
while (docheck){ //check rstk with patterns and reduce
docheck = 0;
if (rstk->top>=4){ //enough elements to check?

int c[4];

for (j=0; j<4; j++)
c[j] = classify(rstk->a[rstk->top-1-j]);

//printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);

for (i=0; i<sizeof parsetab/sizeof*parsetab; i++){

if ( c[0] & parsetab[i].c[0]
&& c[1] & parsetab[i].c[1]
&& c[2] & parsetab[i].c[2]
&& c[3] & parsetab[i].c[3] ) {

int t[4];

printf("match %d\n", i);

t[0] = stackpop(rstk);
t[1] = stackpop(rstk);
t[2] = stackpop(rstk);
t[3] = stackpop(rstk);
switch(i){
PARSETAB(PARSETAB_ACTION)
}
docheck = 1; //stack changed: check again
break;
}
}
}
}
}

//assemble results and return
//TODO check/handle extra elements on stack
//(interpolate?, enclose and cat?)
stackpop(rstk); // mark
return stackpop(rstk);
}

// lookup name in environment unless to the left of assignment
// if the full name is not found, but a defined prefix is found,
// push the prefix back to the left stack and continue lookup
// with remainder. push value to right stack.
int parse_and_lookup_name(stack *lstk, stack *rstk, int x, symtab st){

if (rstk->top && qc(stacktop(rstk))){ //assignment: no lookup
stackpush(rstk,x);
} else {

printf("lookup\n");

int *s;
int n;
switch(gettag(x)){

case PCHAR: { // single char

s = &x;
n = 1;
} break;

case PROG: { // longer name

array a = getptr(x);
s = a->data;
n = a->dims[0];
} break;
}
int *p = s;
symtab tab = findsym(st,&p,&n,0);

if (tab->val == null) {

printf("error undefined prefix\n");

return null;
}
while (n){ //while name

printf("%d\n", n);
//stackpush(lstk,newobj(s,p-s,70)); //pushback prefix name
stackpush(lstk,tab->val); //pushback value
s = p;
tab = findsym(st,&p,&n,0); //lookup remaining name

if (tab->val == null) {

printf("error undefined internal\n");

return null;
}
}
//replace name with defined value

printf("==%d(%d,%x)\n", tab->val, gettag(tab->val), getval(tab->val));
stackpush(rstk,tab->val);
}
return 0;
}

josh@LAPTOP-ILO10OOF ~/inca/olmec
$

luser droog

unread,

Feb 25, 2016, 1:37:47 AM2/25/16

to

On Wednesday, February 24, 2016 at 11:19:24 PM UTC-6, luser droog wrote:
> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
> > luser droog <luser...@gmail.com> writes:
> >
> > > On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> > >> Alright, so maybe my last post wasn't interesting enough. Perhaps not
> > >> hard-core C hacksumschit enough. So here ya you. I dare you to find a
> > >> bug. I dare you to even make sense of this table-driven madness.
> > >>
> > >> These modules both call functions exported from other modules. You can
> > >> find them via their header files from the repo:
> > >> https://github.com/luser-dr00g/inca/tree/master/olmec
> > >>
> > >> The scanner module:
> > >
> > > <snip>
> > >
> > >> And the Parser module. Not all functions are filled-in.

<snip>

> I've added comments and done some refactoring of the parsing
> functions. Some further background links might be useful to
> anyone attempting to deeply understand this code. I make
> extensive use of X-Macros about which I've written on SO:
> http://stackoverflow.com/questions/6635851/real-world-use-of-x-macros
> The verb data structure is exactly the same as in J, and these
> two resources describe it in more detail than I could do here:
> http://archive.vector.org.uk/trad/v094/hui094_85.pdf (scanned)
> And an overview of parsing and scanning in J:
> http://sblom.github.io/openj-core/iojSent.htm
> I've generally tried to copy the structures and algorithms
> described there, but translating to the simplest possible
> (but no simpler) representation in C.
>

Here's an annotated debugging output of the execution of
a simple statement.

$ ./olmec
2+2 <- user input
0032 002b 0032 000d 0000 <- input bytes
number <- scanner analysis
other
number

1 <- rank of resulting array
3 <- length of dim[0]
2(0,2) 16777259(1,2b) 2(0,2) <- encoded values
0x0 <- a hex zero for some reason
->2(0,2) <- push 2 to right stack
->16777259(1,2b) <- push + to right stack
lookup
==117440512(7,0) <- lookup + yielding verb plus
->2(0,2) <- push 2 to right stack
->150994944(9,0) <- push start marker
match 3 <- grammar production matched
dyad <- handler function
4(0,4) <- result

The display of results part needs a lot of work.

Rosario19

unread,

Feb 25, 2016, 2:01:33 AM2/25/16

to

On Wed, 24 Feb 2016 22:37:33 -0800 (PST), luser droog wrote:
> $ ./olmec
> 2+2 <- user input
> 0032 002b 0032 000d 0000 <- input bytes
> number <- scanner analysis
> other
> number
>
> 1 <- rank of resulting array
> 3 <- length of dim[0]
> 2(0,2) 16777259(1,2b) 2(0,2) <- encoded values
> 0x0 <- a hex zero for some reason
> ->2(0,2) <- push 2 to right stack
> ->16777259(1,2b) <- push + to right stack
> lookup
> ==117440512(7,0) <- lookup + yielding verb plus
> ->2(0,2) <- push 2 to right stack
> ->150994944(9,0) <- push start marker
> match 3 <- grammar production matched
> dyad <- handler function
> 4(0,4) <- result

I like "luser droog" way of write/ think
even if i not agree in use too much macro for debug purpose
but the stack has a max limit or the prog if goes out of stack seg
fault?

but where is the advantage of using for one language, one interpreter
and not a compiler?

luser droog

unread,

Feb 25, 2016, 2:22:26 AM2/25/16

to

Still too much macro? I suppose that reaction is somewhat par for the
course. But it enables me to write table-driven code with simple
functions and powerful data-structures. I can never go back.
But I'd love to help others along. Go on, give in to your anger....

> but where is the advantage of using for one language, one interpreter
> and not a compiler?

Play! An interactive interpreter gives immediate feedback.
You hit enter and you get unicorns or nasal demons. It's all up to
the shit you type! Fuckin' awesome is what!

There are potential places where I might gain performance by
'compiling' for some simple main-line code, but it then loses
flexibility. One of the uses I envision for the prefix-matching
ability of identifiers is idiom-recognition. A sequence of function
symbols may be redefined to do something else, conceivably a faster
way of doing that combination. I keep expecting cries of horror
when I mention this. An expression like

a+b

is a single identifier. It's a single token. No bs. It gets
shoveled off to the symbol-table which does a prefix match.
If it finds an 'a' defined, that gets split off and it looks
again for '+b', presumably then finding '+' and then searching
again for 'b'.

So, that part's actually really really weird. Produces normal
results for normal kinda stuff, but enables a program a great
deal of ability to redefine its appearance as well as behavior.

It's intended as golfing language btw. Another side-effect of
all this is identifiers cannot contain digits. No way. You
cannot define 'line1' and 'line2' as variables. It's a dang
array language, see. If you want to number several things,
put them into an array. Then you can index it with numbers.

Once I add indexing, that is. More decisions to make.

Tim Rentsch

unread,

Feb 26, 2016, 11:07:29 AM2/26/16

to

> [several paragraphs of explanation]
>
> Does that help and/or make it more interesting? [snip code]

Yes, it does both. Your comments were enough to get me over the
hump and dig into it. I have omitted quoting your code to try to
keep things short, hopefully that won't be a problem. In the
meantime it looks like you've posted an update but I haven't yet
looked at that in any detail, so there may be some overlap with
what I have to say below. Many or most of my comments are more
about presentation than substance but I hope still useful and of
interest. And there are at least a few on substance. Forgive me
if the remarks below are in no particular order.

First, about commenting. There is a lot of useful information in
the comments but they are spread through the program kind of like
a hologram, in a way that makes it hard to form an overall
picture. It would be better to have a single large comment at
the start (of wd.c, for example) that explains everything that's
going on, and leave off the local comments. The single initial
comment should explain enough so that any further comments are
unnecessary except in rare and isolated cases.

Second, order and organization. Focusing on wd.c, the order and
organization could be improved. The main function wd() should be
first. (Incidentally both the name of the file and the name of
this function could be better, but I think you have already
mentioned that.) The table wdtab[] and associated macros should
be last (with appropriate earlier declaration). I favor having a
separate "private" include file (for each .c file) for such
things, but putting a few forward declarations at the start of
wd.c shouldn't be too bad here. The definition for the enum type
symbols can't be at the end but I think it can be put after the
wd() function, and if so then should be (and moving it into a
separate "private" header, eg, "wd-private.h", is even better).

Third, variable and type names. The choice of variable names
seems pretty ad hoc and patternless. I don't mind one letter
(or sometimes two letter) variable names but it helps to have
some sort of discernible pattern, and I didn't see any. More
important though is type names, or more precisely the lack of
type names. It seems like everything is just 'int'! It would
help a lot to assign names to the different kinds of things the
code operates on, to see what goes where. I have some examples
in the code I'm going to give below.

I think that's all the broad comments I have. Now let's look at
some specifics.

The two large functions are both too long. I don't mean they
exceed some absolute limit but have too much detail to be easily
understandable as a single function. For newobj() there is a
straightforward refactoring which to me is a big improvement:

int
newobj( int *s, int n, int state ){
switch( state ){
case num:
case dit: return new_numeric( s, n, state );

case quo:
case str: return new_string( s, n, state );

case ini:

case dot:
case dut:
case oth:

case sng: return new_executable( s, n, state );

default: return newdata( NULLOBJ, 0 );
}
}

with new subsidiary functions new_numeric(), etc. Putting the
detailed functionality in subsidiary functions also lets us see
what that is in each case, by choosing a good name for those.

Now we turn to the wd() function. Let me make a few preliminary
observations, then show a plausible refactoring, then after going
through the refactoring make some additional comments.

The emit() macro complicates things more than it helps. It is
called only twice, and easily could be just written out in the
two places it is called. The definition refers to 'p' which is
not a macro parameter, which is generally considered a bad idea.
Also I think it may conceal a bug, a confusion between a 'state'
quantity and a 'state' quantity times 10. Even if this isn't a
bug the apparent discrepancy should be pointed out and clarified.
I discovered this bug (or potential bug) only after expanding the
macro call by hand. (Added while composing this response: I see
now that muliplying by 10 is needed; to me this says how these
two quantities are encoded is confusing and needs some sort of
fixup.)

The main loop updates state, oldstate, and oldoldstate part way
through the loop. This makes the invariants more complicated,
and also means three variables are used where (I think) only
two are needed. My refactoring shows a way to address this.

I mentioned a need for additional type names. The example below
shows some cases that I think should be changed by not all of
them. Also, although I have supplied enough scaffolding to be
able to compile the code, obviously I haven't tried running it,
so I'm not sure if I may have introduced some bugs in the
process. If I haven't, all well and good. If I have, that is an
indication that some subtlety in the original code needs to be
pointed out and clarified.

Okay, here comes the revised wd() function:

array
wd( int *s, int n ){
array result = array_new( n + 1 );
int *p = result->data, *const p1 = p+1;
State ss, st; /* ss is last state, st is current state */
StateAndActionCode cc;
int i, j;

for( i = j = 0, ss = st = 0; i < n; i++, ss = st, st = cc/10 ){
cc = code_lookup( s[i], st );

switch( cc % 10 ){
case 0: /* do nothing */
break;

case 1: *p++ = newobj( s+j, i-j, st*10 );
j = i;
break;

case 2: j = i;
break;

case 3: *p++ = newobj( s+j, i-1-j, ss*10 );
j = i - 1;
break;
}

if( p > p1 ) p = collapse_adjacent_numbers_if_needed( p );
}

result->dims[0] = p - result->data; // set actual encoded length
return result;
}

To me this writing is a big improvement on the original. I can
see what's going on without getting bogged down in low level
details, which are tucked away in two new subsidiary functions.
There are only two states referenced at any given time, with the
change of those having been made part of the loop invariant. Two
new type names have been introduced, showing what sorts of
quantities occupy the different variables involved; these names
may be off compared to how the words are used in other parts of
the program, but if so that is an indication that this point
needs clarification. A very minor point: I introduced the
variable 'p1' to hold the unvarying value against which 'p'
is compared to see if collapsing may need to be done.

Now a few notes on potential further improvements. One, the need
to multiply by 10 in the calls to newobj() says to me that
somewhere a poor design decision was made. I believe the code is
not wrong, but it isn't obviously right either, and that raises a
red flag. Two, the conversions of the StateAndActionCode
variable 'cc' are done directly using divide and remainder
operations; it would be better if these were abstracted as
operations, eg, state_from() and action_from() functions (or
macros). Three, the setting of 'cc' is probably better if the
line

cc = code_lookup( s[i], st );

were changed to

cc = wdtab[ st ][ character_class( s[i] ) ];

simplifying the subsidiary function needed, and showing more
directly where the value for 'cc' comes from.

Now just a few more comments on the original code.

First, you notice the function above initializes both 'i' and 'j'
before starting the loop. I did this because in the original
code it looked like it might be possible to use 'j' before it has
been given a value. (I found this myself, but gcc later verified
it.) I suspect that in fact this situation cannot occur, by
virtue of what values are present in wdtab[][]. However, one way
or another that subtlety should be addressed.

Second, the original code contains these lines:

(--p)[-1] = cache(ARRAY, vector(p[-1],p[0]));

and

(--p)[-1] = cache(ARRAY, cat(getptr(p[-1]), vector(p[0])));

Both of these are undefined behavior, because the variable 'p' is
both updated and read without any intervening sequence point.
What I think is meant is

--p, p[-1] = cache(ARRAY, vector(p[-1],p[0]));

and

--p, p[-1] = cache(ARRAY, cat(getptr(p[-1]), vector(p[0])));

but whatever it is that is meant should be expressed in a way
that C semantics can't screw it up.

Third, and last but not least, how the character class lookup is
done seems painfully slow. It should be easy to do this using a
hash-table-like technique, especially if there is a guarantee
that the int/character values use an ASCII/Unicode encoding. For
example, if that guarantee applies, then the character_class()
function mentioned above should be reducible to something like

return c < 64 ? values[c] : c == 0x2910 ? LEFT_CODE : 0;

where 'values[]' is a 64 element array of unsigned char values,
initialized using designated initializers, eg,

static unsigned char
values[64] = {
['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
[ '.' ] = 2,
[ '(' ] = 3,
[ ')' ] = 4,
[ '\'' ] = 5,
[ ' ' ] = 6, ['\t'] = 6,
[ 0x0D ] = 8,
}
;

You see what I mean?

Okay, I'm stopping here. Probably longer than you expected, but
hopefully it was helpful.

luser droog

unread,

Feb 27, 2016, 2:11:15 AM2/27/16

to

On Friday, February 26, 2016 at 10:07:29 AM UTC-6, Tim Rentsch wrote:

> You see what I mean?

Yes, yes, yes! Thank you immensely much.

> Okay, I'm stopping here. Probably longer than you expected, but
> hopefully it was helpful.

Yes, I'm reworking all the code in light of these points.
It seems as though most of the places where I added one-
line comments and paragraph breaks in the middle of
functions are the precise places where I should consider
writing a sub-function. The function's name is essentially
a "free" comment.

Ian Collins

unread,

Feb 27, 2016, 2:17:57 AM2/27/16

to

luser droog wrote:

> It seems as though most of the places where I added one-
> line comments and paragraph breaks in the middle of
> functions are the precise places where I should consider
> writing a sub-function. The function's name is essentially
> a "free" comment.

If you recognise that, you have made a significant breakthrough!

--
Ian Collins

luser droog

unread,

Feb 27, 2016, 4:08:18 AM2/27/16

to

I've finished rewriting. I believe it incorporates every suggestion.
But I've modified the style, of course. I also made an improvement, I think,
to the collapsing of adjacent numbers by using the more appropriate function
`scalar` to create the one-element array when concatenating. `vector` works
for this, but it looked funny once it was more visible.

Some change in the memory layout also triggered a bug in the parser
with an uninitialized `int j`, so I'm watching out for that a little
more closely now.

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.h

// construct a new object fron n chars starting at s

// state indicates the type of object to construct

int newobj(int *s, int n, int state);

// scan up to n chars from s and produce 1D array of encoded expression

array scan_expression(int *s, int n);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd_private.h
/*
* The transition table and state set
*
* Each state embodies a certain amount of "knowledge"
* about what sort of token has been encountered.
* The dot character '.' causes a great deal of trouble
* since it is heavily overloaded. If the dot has a digit
* on either or both sides, then it is considered a decimal
* point separating the integer and fractional parts of a
* floating-point number. TODO: add floating-point numbers.
* Otherwise, the dot is considered part of an identifier.
*
* Note, the state snum codes are 10* the corresponding table index.
*/

enum state {
ini=0, //indeterminate
dot=10, //initial dot .
num=20, //integer 0
dit=30, //medial dot 0.
fra=40, //fraction 0.0
str=50, //initial quote '
quo=60, //end or escape quote 'aaa''
oth=70, //identifier or other symbol a+
dut=80, //trailing dot +.
sng=90, //copula or other self-delimiting symbol ()
};

int wdtab[][sizeof "0.'() <r"] = {
/*state*/
/*|*//* character class*/
/*V*//* none 0-9 . ' ( ) sp <- \r */

/*0*/ { oth+2, num+2, dot+2, str+2, sng+2, sng+2, ini+0, sng+2, ini+0 },
/*10*/{ oth+0, fra+0, oth+0, str+1, ini+1, ini+1, ini+1, sng+1, ini+1 },
/*20*/{ oth+1, num+0, dit+0, str+1, oth+1, oth+1, ini+1, sng+1, ini+1 },
/*30*/{ oth+0, num+1, dut+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*40*/{ oth+1, fra+0, dot+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*50*/{ str+0, str+0, str+0, quo+0, str+0, str+0, str+0, str+0, ini+1 },
/*60*/{ oth+1, num+1, dot+1, str+0, oth+1, oth+1, ini+1, sng+1, ini+1 },
/*70*/{ oth+0, num+1, dut+0, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*80*/{ oth+0, num+3, dut+0, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
/*90*/{ oth+1, num+1, dot+1, str+1, sng+1, sng+1, ini+1, sng+1, ini+1 },
};

token newobj(int *s, int n, int state);
token *collapse_adjacent_numbers_if_needed(token *p);
unsigned char character_class(int ch);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.c

/*
* Word Formation (scanning)
*
* As shown in en.c and en.h, the "encoding module",
* characters are stored in 24bits of a 32bit
* int, so Unicode characters are referred-to by their UCS4 code.
* This decision affects the scanner code in that it must deal
* with "int-strings" although the contents are expected to
* primarily be restricted to the ascii domain. One special char
* recognized by the scanner is the left-arrow char ← which is
* used for assignment of values to variables.
*
* The scanner is also unusual in that it treats most characters
* as identifier characters, even the punctuation chars which
* designate functions. These identifiers are resolved later
* during symbol-lookup using prefix-matching to further scan
* and parse the identifiers. For the current purpose of these
* functions, it is sufficient to distinguish numbers from non-
* numbers and to ensure that certain special characters like
* the left-arrow and the parens are encoded as single-chars
* and not parts of identifiers.
*
* So it's a state-machine that runs through each character
* of the int-string. The character is classified into a
* character-class which determines the column of the big table.
* The current state (initially 0 or "ini") determines
* the row of the big table. The value in the table encodes
* a new state (the 10s value) and an action (the 1s value).
* The action code adjusts the start-of-token position in
* the strings and can trigger the generation of a new token.
* The new token is packed into an integer handle and simply
* appended to the array structure to be returned.
*
* The state-machine itself is "programmed" by the table and
* enum definitions in wd_private.h.
*/

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ar.h" // array type
#include "en.h" // atomic encoding

#include "wd.h"

typedef int token;
typedef char state;
typedef char state_and_action_code;

#include "wd_private.h"

array scan_expression(int *s, int n){
array result = array_new(n+1);
token *p = result->data, *const p1 = p+1;
state ss, st; /* last state, current state */
state_and_action_code cc;
int i,j;

for (i=j=0, ss=st=0; i < n; i++, ss=st, st=cc/10){
cc = wdtab[st][ character_class(s[i]) ];

switch (cc%10){

case 0: /* do nothing */
break;

case 1: *p++ = newobj(s+j, i-j, st*10);

j=i;
break;

case 2: j=i;

break;

case 3: *p++ = newobj(s+j, i-1-j, ss*10);
j=i-1;
break;
}

if (p > p1) p=collapse_adjacent_numbers_if_needed(p);
}

result->dims[0] = p - result->data; // set actual encoded length
return result;
}

token *collapse_adjacent_numbers_if_needed(token *p){
if (gettag(p[-2])==LITERAL && gettag(p[-1])==LITERAL){
--p;
p[-1]=cache(ARRAY, vector(p[-1],p[0]));
}
if (gettag(p[-2])==ARRAY && gettag(p[-1])==LITERAL){
--p;
p[-1]=cache(ARRAY, cat(getptr(p[-1]), scalar(p[0])));
}
return p;
}

static unsigned char cctab[64] = {
['0']=1, ['1']=1, ['2']=1, ['3']=1, ['4']=1,
['5']=1, ['6']=1, ['7']=1, ['8']=1, ['9']=1,
['.']=2,
['(']=3,
[')']=4,
['\'']=5,
[' ']=6, ['\t']=6,
[0x0D]=8,
};

unsigned char character_class(int ch){
return ch<64? cctab[ch] :
ch==0x2910? 7 :
0;
}

token new_numeric(int *s, int n){
char buf[n+1];
for (int i=0; i<n; i++) buf[i] = s[i];
buf[n] = 0;

char *p;
token t = newdata(LITERAL, strtol(buf,&p,10));

if (*p) {
array z = scalar(t);
while(*p) {
int u = newdata(LITERAL, strtol(p,&p,10));
z = cat(z, scalar(u));
}
t = cache(ARRAY, z);
}
return t;
}

token new_string(int *s, int n){
array t=array_new(n);

for (int i=0; i<n; i++)

*elem(t,i) = newdata(CHAR, s[i]);
return cache(ARRAY, t);
}

token new_executable(int *s, int n){

if (n==1){
if (*s == '(') return newdata(LPAROBJ, 0);

if (*s == ')') return newdata(RPAROBJ, 0);
return newdata(PCHAR, *s);
} else {

array t=array_new(n);

for (int i=0; i<n; i++)

*elem(t,i) = newdata(PCHAR, s[i]);
return cache(PROG, t);
}
}

token newobj(int *s, int n, int state){
switch (state){
case num:
case dit: return new_numeric(s, n);

case quo:
case str: return new_string(s, n);

case ini:
case dot:
case dut:
case oth:

case sng: return new_executable(s, n);

default: return null;
}
}

josh@LAPTOP-ILO10OOF ~/inca/olmec
$

BartC

unread,

Feb 27, 2016, 7:14:54 AM2/27/16

to

On 27/02/2016 09:08, luser droog wrote:

> array scan_expression(int *s, int n){
> array result = array_new(n+1);
> token *p = result->data, *const p1 = p+1;
> state ss, st; /* last state, current state */
> state_and_action_code cc;
> int i,j;
>
> for (i=j=0, ss=st=0; i < n; i++, ss=st, st=cc/10){
> cc = wdtab[st][ character_class(s[i]) ];
>
> switch (cc%10){
> case 0: /* do nothing */
> break;
>
> case 1: *p++ = newobj(s+j, i-j, st*10);
> j=i;
> break;
>
> case 2: j=i;
> break;
>
> case 3: *p++ = newobj(s+j, i-1-j, ss*10);
> j=i-1;
> break;
> }
>
> if (p > p1) p=collapse_adjacent_numbers_if_needed(p);
> }
>
> result->dims[0] = p - result->data; // set actual encoded length
> return result;
> }

I tweaked this function to be more 'bart'-style: I don't like executable
statements lurking in shady corners; I like for-loops to do one job of
iteration and nothing else; I avoid comma operators; and I can't see the
point of 'const' (which I believe ought to have been strictly applied to
'array' too, but it anyway stops me re-arranging the initial assignments).

Note that these appear to be minority views in this group.

array scan_expression(int *s, int n){

array result;
token *p, *p1;

state ss, st; /* last state, current state */
state_and_action_code cc;
int i, j;

result = array_new(n+1);
p = result->data;
p1 = p+1;
j = 0;

ss = st = 0;

for (i=0; i < n; i++) {

cc = wdtab[st][character_class(s[i])];

switch (cc%10) {
case 0:

break;

case 1:
*p++ = newobj(s+j, i-j, st*10);

j = i;
break;

case 2:
j = i;

break;

case 3:
*p++ = newobj(s+j, i-1-j, ss*10);
j = i-1;
break;
}

if (p > p1)
p = collapse_adjacent_numbers_if_needed(p);

ss = st;
st = cc/10;
}

result->dims[0] = p - result->data; /* .... */
return result;
}

(Code not compiled.)

--
Bartc

luser droog

unread,

Feb 27, 2016, 7:17:08 PM2/27/16

to

That does seem easier to digest line-by-line, but it
changes it from just under a screenfull to just over a
screenfull so it makes it harder to grasp the gestalt
IMO.

luser droog

unread,

Feb 27, 2016, 9:01:53 PM2/27/16

to

I've re-written the parsing and execution functions,
attempting to apply the same ideas. There is now a private
header with the table and other "gobbledegook". And now
just functions in the .c file, with the big one up front.

Execution in APL proceeds right-to-left and this is
accomplished with a relatively straightforward algorithm.
We have 2 stacks (it could also be done with a queue and
a stack) called the left-stack and the right-stack.
The left stack starts at the left edge and expands
on the right.

|- 0 1 2 3 top

The right stack is the opposite, anchored at the right
and growing to the left.

top 3 2 1 0 -|

Of course this are just conceptual distinctions: they're
both just stacks. The left stack is initialized with a
mark object to indicate the left edge, and then
the entire expression. The right stack has a single
null object to indicate the right edge.

|-2*1+⍳4 -|

At each step, we A) move one object to the right stack,

|-2*1+⍳ 4-|
|-2*1+ ⍳4-|
|-2*1 +⍳4-|

If there are at least 4 objects on the right stack, then
classifies the top 4 elements with a set of predicate
functions and then it checks through the list of
grammatical patterns, the above case matching this
production,

/* p[0] p[1] p[2] p[3] func pre x y z post,2*/\

_(L1, EDGE+AVN, VRB, VRB, NOUN, monad, -1, 2,3,-1, 1, 0) \

application of a monadic verb. The numbers in the production indicate
which elements should be preserved, and which should be passed to the
handler function. The result from the handler function is interleaved
back onto the right stack.

|-2*1 +A-|

where A represents the array object returned from the iota function.
(Incidentally this is a lazy array, generating its values on-demand.)

Eventually the expression ought to reduce to 3 objects: a mark,
some result object, and a null. Anything else is an error
TODO handle this error.

I'll give all the header files again, to keep this message self-contained.

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ !.
./olmec
50331691(3,2b) = 134217728(8,0)
50331693(3,2d) = 134217729(8,1)
50331690(3,2a) = 134217730(8,2)
50340724(3,2374) = 134217731(8,3)
50331683(3,23) = 134217732(8,4)
50340723(3,2373) = 134217733(8,5)
50331771(3,7b) = 134217734(8,6)
50331692(3,2c) = 134217735(8,7)
50331707(3,3b) = 134217736(8,8)
50331686(3,26) = 150994944(9,0)
50331712(3,40) = 150994945(9,1)
2*1+⍳4
0032 002a 0031 002b 2373 0034 000d 0000

1
5
2(0,2) 50331690(3,2a) 1(0,1) 67108864(4,0) 4(0,4)
0x0
->4(0,4)
->67108864(4,0)
lookup
1
==134217733(8,5)
->134217728(8,0)
->1(0,1)
match 1
monad
->50331690(3,2a)
lookup
==134217730(8,2)
match 3
dyad
->2(0,2)
->167772160(10,0)
match 3
dyad
83886080(5,0)
1
4
0: 2(0,2)
1: 4(0,4)
2: 6(0,6)
3: 8(0,8)

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ vi ex_private.h

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ head ex.c

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ar.h"
#include "en.h"
#include "st.h"
#include "wd.h"

#include "vb.h"

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ar.h
#ifndef AR_H_
#define AR_H_
#include "../ppnarg.h"

typedef struct ar {
int type;
int rank; // number of dimensions
int *dims; // size of each dimension
int cons; // constant term of the indexing formula
int *weight; // corresponding coefficient in the indexing formula
int *data; // address of first array element
int *(*func)(struct ar *,int); // data function (if function type)
} *array;

enum type {
normal,
indirect,
function
};

int productdims(int rank, int dims[]);
array array_new_dims(int rank, int dims[]);
array array_new_function(int rank, int dims[],

int *data, int datan, int *(*func)(array,int)); // type=function
int *constant(array a,int idx);
int *j_vector(array a,int idx);

void loaddimsv(int rank, int dims[], va_list ap);
array (array_new)(int rank, ...);
#define array_new(...) (array_new)(PP_NARG(__VA_ARGS__),__VA_ARGS__)

array cast_dims(int data[], int rank, int dims[]); // type=indirect
array (cast)(int data[], int rank, ...); // type=indirect
#define cast(data,...) (cast)(data,PP_NARG(__VA_ARGS__),__VA_ARGS__)
array clone(array a); // type=indirect

array copy(array a);

int *elema(array a, int ind[]);
int *elemv(array a, va_list ap);
int *elem(array a, ...);

int *vector_index(int ind, int dims[], int n, int vec[]);
int ravel_index(int vec[], int dims[], int n);

void transpose2(array a);
void transpose(array a, int shift);
void transposea(array a, int spec[]);

array slice(array a, int i); // type=indirect
array slicea(array a, int spec[]); // type=indirect
array slices(array a, int s[], int f[]); // type=indirect
array extend(array a, int extra); // type=indirect

array cat(array x, array y);

array iota(int n); // type=function

$ cat st.h
/* symbol table */

typedef struct st {
int key;
int val;
int n;
struct st **tab /*[n]*/ ;
} *symtab;

symtab makesymtab(int n);
/* mode=0: prefix match
mode=1: defining search */
symtab findsym(symtab st, int **spp, int *n, int mode);
void def(symtab st, int name, int v);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat wd.h

// scan up to n chars from s and produce 1D array of encoded expression

array scan_expression(int *s, int n);

josh@LAPTOP-ILO10OOF ~/inca/olmec

$ cat vb.h

#define VERBTAB(_) \
/*base monad dyad f g h mr lr rr*/ \
_('+', vid, vplus, 0, 0, 0, 0, 0, 0 ) \
_('-', vneg, vminus, 0, 0, 0, 0, 0, 0 ) \
_('*', vsignum, vtimes, 0, 0, 0, 0, 0, 0 ) \
_(0x2374/*rho*/, vshapeof, vreshape, 0, 0, 0, 0, 0, 0 ) \
_('#', vtally, 0, 0, 0, 0, 0, 0, 0 ) \
_(0x2373/*iota*/, viota, 0, 0, 0, 0, 0, 0, 0 ) \
_('{', vhead, vtake, 0, 0, 0, 0, 1, 0 ) \
_(',', vravel, vcat, 0, 0, 0, 0, 0, 0 ) \
_(';', vraze, vlink, 0, 0, 0, 0, 0, 0 ) \
/**/
typedef struct verb {
int id;
int (*monad)(int,struct verb*);
int (*dyad)(int,int,struct verb*);
int f,g,h; /* operator arguments */
int mr,lr,rr; /* monadic,left,right rank*/
} *verb;

void init_vb(symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec

$ cat ex.h

// predicate table contains predicate functions
// and associated enum values

#define PREDTAB(_) \
_( ANY = 1, qa, 1 ) \
_( VAR = 2, qp, gettag(x)==PROG \

|| (gettag(x)==PCHAR && getval(x)!=0x2190 /*leftarrow*/ ) ) \

_( NOUN = 4, qn, gettag(x)==LITERAL \
|| gettag(x)==CHAR \
|| gettag(x)==ARRAY ) \
_( VRB = 8, qv, gettag(x)==VERB ) \

_( DEX = 16, qx, 0 ) /*dextri-monadic verb*/\
_( ADV = 32, qo, gettag(x)==ADVERB && ((verb)getptr(x))->monad ) \
_( LEV = 64, qe, 0 ) /*sinister adverb*/\
_( CONJ = 128, qj, gettag(x)==ADVERB && ((verb)getptr(x))->dyad ) \
_( MARK = 256, qm, gettag(x)==MARKOBJ ) \
_( ASSN = 512, qc, gettag(x)==PCHAR && getval(x) == 0x2190 ) \
_( LPAR = 1024, ql, gettag(x)==LPAROBJ ) \
_( RPAR = 2048, qr, gettag(x)==RPAROBJ ) \
_( NUL = 4096, qu, gettag(x)==NULLOBJ ) \
/**/

// declare predicate functions
#define PRED_DECL(X,Y,...) int Y(int);
PREDTAB(PRED_DECL)

// declare predicate enums and composed patterns

#define PRED_ENUM(X,...) X,
enum predicate { PREDTAB(PRED_ENUM)
EDGE = MARK+ASSN+LPAR,
AVN = VRB+NOUN+ADV };

// execute an expression e with environment st

int execute_expression(array e, symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex_private.h

/* stack type
the size is generously pre-calculated
and so we can skip all bounds checking.
stkp->top is the size (index of next empty slot for next push)
stkp->top-1 is the topmost element
*/

typedef struct stack { int top; object a[1];} stack; /* top==0::empty */

#define stackpush(stkp,el) ((stkp)->a[(stkp)->top++]=(el))
#define stackpop(stkp) ((stkp)->a[--((stkp)->top)])
#define stacktop(stkp) ((stkp)->a[(stkp)->top-1])

int parse_and_lookup_name(stack *lstk, stack *rstk, object x, symtab st);

/* predicate functions are instantiated according to the table
defined in the header.
the q[] function array is used by classify to apply all
predicate functions yielding a sum of all applicable codes
defined in the table. Specific qualities or combinations
may then be determined easily by masking.
*/

#define PRED_FUNC(X,Y,...) int Y(object x){ return __VA_ARGS__; }
PREDTAB(PRED_FUNC)
#define PRED_ENT(X,Y,...) Y,
static int (*q[])(object) = { PREDTAB(PRED_ENT) };

/* encode predicate applications into a binary number

which can be compared to a pattern with a mask */

static inline int classify(object x){

int i,v,r;
for (i=0, v=1, r=0; i<sizeof q/sizeof*q; i++, v*=2)
if (q[i](x))
r |= v;
return r;
}

// the Parse Table defines the grammar of the language
// At each stack move, the top four elements of the right stack
// are checked against each of these patterns. A matching pattern

// returns element t[pre] from the temp area to the right stack

// then calls func(t[x],t[y],t[z]) and pushes the result to the
// right stack, then pushes t[post] and t[post2].
#define PARSETAB(_) \

/* p[0] p[1] p[2] p[3] func pre x y z post,2*/\

_(L0, EDGE, VRB, NOUN, ANY, monad, 3, 1,2,-1, 0,-1) \
_(L1, EDGE+AVN, VRB, VRB, NOUN, monad, -1, 2,3,-1, 1, 0) \

_(L2, ANY, NOUN, DEX, ANY, monad, 3, 2,1,-1, 0,-1) \
_(L3, EDGE+AVN, NOUN, VRB, NOUN, dyad, -1, 1,2,3, 0,-1) \
_(L4, EDGE+AVN, NOUN+VRB, ADV, ANY, adv, 3, 1,2,-1, 0,-1) \
_(L5, ANY, LEV, NOUN+VRB, ANY, adv, 3, 2,1,-1, 0,-1) \
_(L6, EDGE+AVN, NOUN+VRB, CONJ, NOUN+VRB, conj_, -1, 1,2,3, 0,-1) \
_(L7, VAR, ASSN, AVN, ANY, spec, 3, 0,2,-1, -1,-1) \
_(L8, LPAR, ANY, RPAR, ANY, punc, 3, 1,-1,-1, -1,-1) \
_(L9, MARK, ANY, RPAR, ANY, punc, 3, 1,-1,-1, 0,-1) \
_(L10,ANY, LPAR, ANY, NUL, punc, 3, 2,-1,-1, 0,-1) \
/**/

// generate labels to coordinate table and execution

#define PARSETAB_INDEX(label, ...) label,
enum { PARSETAB(PARSETAB_INDEX) };

// create parsetab array of structs containing the patterns

#define PARSETAB_PAT(label, pat1, pat2, pat3, pat4, ...) \
{pat1, pat2, pat3, pat4},

typedef struct parsetab { int c[4]; } parsetab;
static parsetab ptab[] = { PARSETAB(PARSETAB_PAT) };

int check_pattern(int *c, parsetab *ptab, int i);

// perform the grammar production, transforming the stack

#define PARSETAB_ACTION(label,p1,p2,p3,p4, func, pre,x,y,z,post,post2) \
case label: { \
if (pre>=0) stackpush(rstk,t[pre]); \
stackpush(rstk,func(x>=0?t[x]:0,y>=0?t[y]:0,z>=0?t[z]:0,st)); \
if (post>=0) stackpush(rstk,t[post]); \
if (post2>=0) stackpush(rstk,t[post2]); \
} break;

size_t sum_symbol_lengths(array e, int n);

int monad(int f, int y, int dummy, symtab st);
int dyad(int x, int f, int y, symtab st);
int adv(int f, int g, int dummy, symtab st);
int conj_(int f, int g, int h, symtab st);

int spec(int name, int v, int dummy, symtab st);
int punc(int x, int dummy, int dummy2, symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.c

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ar.h"
#include "en.h"
#include "st.h"
#include "wd.h"

#include "vb.h"
#include "ex.h"

typedef int object;
#include "ex_private.h"

int check_pattern(int *c, parsetab *ptab, int i){
return c[0] & ptab[i].c[0]
&& c[1] & ptab[i].c[1]
&& c[2] & ptab[i].c[2]
&& c[3] & ptab[i].c[3];

}

// execute expression e using environment st and yield result

int execute_expression(array e, symtab st){

int n = e->dims[0];
int i,j;

object x;
stack *lstk,*rstk;
int docheck;

j=sum_symbol_lengths(e,n);

lstk=malloc(sizeof*lstk + (n+j+1) * sizeof*lstk->a);
lstk->top=0;
stackpush(lstk,mark);

for (i=0; i<n; i++)

stackpush(lstk,e->data[i]); // push entire expression to left stack

rstk=malloc(sizeof*rstk + (n+j+1) * sizeof*rstk->a);
rstk->top=0;
stackpush(rstk,null);

while(lstk->top){ //left stack not empty
x=stackpop(lstk);

printf("->%d(%d,%x)\n", x, gettag(x), getval(x));

if (qp(x)){ // x is a pronoun?
if (parse_and_lookup_name(lstk, rstk, x, st) == null)
return null;

} else {
stackpush(rstk,x);
}

docheck = 1;
while (docheck){ //check rstk with patterns and reduce
docheck = 0;
if (rstk->top>=4){ //enough elements to check?

int c[4];

for (j=0; j<4; j++)
c[j] = classify(rstk->a[rstk->top-1-j]);

//printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);

for (i=0; i<sizeof ptab/sizeof*ptab; i++){
if (check_pattern(c, ptab, i)) {
object t[4];

printf("match %d\n", i);

t[0] = stackpop(rstk);
t[1] = stackpop(rstk);
t[2] = stackpop(rstk);
t[3] = stackpop(rstk);
switch(i){
PARSETAB(PARSETAB_ACTION)
}
docheck = 1; //stack changed: check again
break;
}
}
}
}
}

//assemble results and return
//TODO check/handle extra elements on stack
//(interpolate?, enclose and cat?)
stackpop(rstk); // mark

x = stackpop(rstk);
free(lstk);
free(rstk);
return x;
}

size_t sum_symbol_lengths(array e, int n){
int i,j;
for (i=j=0; i<n; i++) { // sum symbol lengths

if (gettag(e->data[i])==PROG) {
//printf("%p\n", getptr(e->data[i]));
j+=((array)getptr(e->data[i]))->dims[0];
}
}

return j;

}

/* Parser Actions,
each function is called with x y z parameters defined in PARSETAB
*/

int monad(int f, int y, int dummy, symtab st){

printf("monad\n");
verb v = getptr(f);
return v->monad(y,v);
}

int dyad(int x, int f, int y, symtab st){

printf("dyad\n");
verb v = getptr(f);
return v->dyad(x,y,v);
}

int adv(int f, int g, int dummy, symtab st){

printf("adverb\n");
verb v = getptr(g);
return v->monad(f,v);
}

int conj_(int f, int g, int h, symtab st){

printf("conj\n");
verb v = getptr(g);
return v->dyad(f,h,v);
}

//specification

int spec(int name, int v, int dummy, symtab st){

def(st, name, v);

return v;
}

int punc(int x, int dummy, int dummy2, symtab st){
return x;
}

// lookup name in environment unless to the left of assignment
// if the full name is not found, but a defined prefix is found,
// push the prefix back to the left stack and continue lookup
// with remainder. push value to right stack.

int parse_and_lookup_name(stack *lstk, stack *rstk, object x, symtab st){

if (rstk->top && qc(stacktop(rstk))){ //assignment: no lookup
stackpush(rstk,x);
} else {

printf("lookup\n");

int *s;
int n;
switch(gettag(x)){

case PCHAR: { // single char

s = &x;
n = 1;
} break;

case PROG: { // longer name

array a = getptr(x);
s = a->data;
n = a->dims[0];
} break;
}
int *p = s;
symtab tab = findsym(st,&p,&n,0);

if (tab->val == null) {

printf("error undefined prefix\n");

return null;
}
while (n){ //while name

printf("%d\n", n);

stackpush(lstk,tab->val); //pushback value
s = p;

tab = findsym(st,&p,&n,0); //lookup remaining name

if (tab->val == null) {

printf("error undefined internal\n");

return null;
}
}
//replace name with defined value

luser droog

unread,

Feb 29, 2016, 3:00:10 AM2/29/16

to

On Saturday, February 27, 2016 at 8:01:53 PM UTC-6, luser droog wrote:
> On Friday, February 26, 2016 at 10:07:29 AM UTC-6, Tim Rentsch wrote:
> > luser droog <luser...@gmail.com> writes:
> >
> > > On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
> > >> luser droog <luser...@gmail.com> writes:
> > >>
> > >>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> > >>>
> > >>>> These modules both call functions exported from other modules. You can
> > >>>> find them via their header files from the repo:
> > >>>> https://github.com/luser-dr00g/inca/tree/master/olmec
> > >>>>

<snip>

Factored-out several more small functions, and now the main function
fits on one screen so you can grok the whole thing at once. And I
corrected and expanded the story/comment. Of course I've naturally
introduced some weirdness to accomplish this. :)

#define stackinit(stkp,sz) (stkp=malloc(sizeof*stkp + (sz)*sizeof*stkp->a)), \
(stkp->top=0)

#define stackpush(stkp,el) ((stkp)->a[(stkp)->top++]=(el))
#define stackpop(stkp) ((stkp)->a[--((stkp)->top)])
#define stacktop(stkp) ((stkp)->a[(stkp)->top-1])

// perform the grammar production, transforming the stack
#define PARSETAB_ACTION(label,p1,p2,p3,p4, func, pre,x,y,z,post,post2) \
case label: { \
if (pre>=0) stackpush(rstk,t[pre]); \
stackpush(rstk,func(x>=0?t[x]:0,y>=0?t[y]:0,z>=0?t[z]:0,st)); \
if (post>=0) stackpush(rstk,t[post]); \
if (post2>=0) stackpush(rstk,t[post2]); \
} break;

void init_stacks(stack **lstkp, stack **rstkp, array e, int n);
object extract_result_and_free_stacks(stack *lstk, stack *rstk);

int parse_and_lookup_name(stack *lstk, stack *rstk, object x, symtab st);

int check_pattern(int *c, parsetab *ptab, int i);

void move_top_four_to_temp(object *t, stack *rstk);

size_t sum_symbol_lengths(array e, int n);

int monad(int f, int y, int dummy, symtab st);
int dyad(int x, int f, int y, symtab st);
int adv(int f, int g, int dummy, symtab st);
int conj_(int f, int g, int h, symtab st);
int spec(int name, int v, int dummy, symtab st);
int punc(int x, int dummy, int dummy2, symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat ex.c

#if 0
Parsing and Execution

Execution in APL proceeds right-to-left and this is
accomplished with a relatively straightforward algorithm.
We have 2 stacks (it could also be done with a queue and
a stack) called the left-stack and the right-stack.
The left stack starts at the left edge and expands
on the right.

|- 0 1 2 3 top

The right stack is the opposite, anchored at the right
and growing to the left.

top 3 2 1 0 -|

Of course these are just conceptual distinctions: they're

both just stacks. The left stack is initialized with a

mark object (illustrated here as ^) to indicate the left
edge, followed by the entire expression. The right stack
has a single null object (illustrated here as $) to indicate
the right edge.

|-^2*1+⍳4 $-|

At each step, we A) move one object to the right stack,

|-^2*1+⍳ 4$-|

Until there are at least 4 objects on the right stack, we do
nothing else.

|-^2*1+ ⍳4$-|
|-^2*1 +⍳4$-|

If there are at least 4 objects on the right stack, then

we B) classify the top 4 elements with a set of predicate
functions and then check through the list of grammatical patterns,
but this configuration (VERB VERB NOUN NULL) doesn't match anything.
Move another object and try again.

|-^2* 1+⍳4$-|

Now, the above case (NOUN VERB VERB NOUN) matches this production:

/* p[0] p[1] p[2] p[3] func pre x y z post,2*/\
_(L1, EDGE+AVN, VRB, VRB, NOUN, monad, -1, 2,3,-1, 1, 0) \

application of a monadic verb. The numbers in the production indicate
which elements should be preserved, and which should be passed to the
handler function. The result from the handler function is interleaved
back onto the right stack.

|-^2* 1+A$-| A←⍳4

where A represents the array object returned from the iota function.
(Incidentally this is a lazy array, generating its values on-demand.)

|-^2 *1+A$-| dyad
|-^2 *B$-| B←1+A
|-^ 2*B$-|
|- ^2*B$-| dyad
|- ^C$-| C←2*B

Eventually the expression ought to reduce to 3 objects: a mark,
some result object, and a null. Anything else is an error
TODO handle this error.

|- ^C$-|
^
|
result

#endif

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ar.h"
#include "en.h"
#include "st.h"
#include "wd.h"
#include "vb.h"
#include "ex.h"

typedef int object;
#include "ex_private.h"

// execute expression e using environment st and yield result

//TODO check/handle extra elements on stack (interpolate?, enclose and cat?)

int execute_expression(array e, symtab st){

int i,j,n = e->dims[0];
stack *lstk,*rstk;
int docheck;

init_stacks(&lstk, &rstk, e, n);

while(lstk->top){ //left stack not empty

object x = stackpop(lstk);

printf("->%d(%d,%x)\n", x, gettag(x), getval(x));

if (qp(x)){ // x is a pronoun?
if (parse_and_lookup_name(lstk, rstk, x, st) == null)
return null;
} else stackpush(rstk,x);

docheck = 1;
while (docheck){ //check rstk with patterns and reduce
docheck = 0;
if (rstk->top>=4){

int c[4];
for (j=0; j<4; j++)
c[j] = classify(rstk->a[rstk->top-1-j]);

for (i=0; i<sizeof ptab/sizeof*ptab; i++)
if (check_pattern(c, ptab, i)) {
object t[4];

move_top_four_to_temp(t, rstk);

switch(i){
PARSETAB(PARSETAB_ACTION)
}
docheck = 1; //stack changed: check again
break;
}
}
}
}

return extract_result_and_free_stacks(lstk,rstk);

}

size_t sum_symbol_lengths(array e, int n){
int i,j;
for (i=j=0; i<n; i++) { // sum symbol lengths
if (gettag(e->data[i])==PROG) {
//printf("%p\n", getptr(e->data[i]));
j+=((array)getptr(e->data[i]))->dims[0];
}
}
return j;
}

void init_stacks(stack **lstkp, stack **rstkp, array e, int n){
int i,j;
#define lstk (*lstkp) /* by-reference */
#define rstk (*rstkp)
j=sum_symbol_lengths(e,n);
stackinit(lstk,n+j+1);
stackpush(lstk,mark);
for (i=0; i<n; i++) stackpush(lstk,e->data[i]); // push expression
stackinit(rstk,n+j+1);
stackpush(rstk,null);
#undef lstk
#undef rstk
}

object extract_result_and_free_stacks(stack *lstk, stack *rstk){
object x;
stackpop(rstk); // pop mark

x = stackpop(rstk);
free(lstk);
free(rstk);
return x;
}

int check_pattern(int *c, parsetab *ptab, int i){
return c[0] & ptab[i].c[0]
&& c[1] & ptab[i].c[1]
&& c[2] & ptab[i].c[2]
&& c[3] & ptab[i].c[3];
}

void move_top_four_to_temp(object *t, stack *rstk){

t[0] = stackpop(rstk);
t[1] = stackpop(rstk);
t[2] = stackpop(rstk);
t[3] = stackpop(rstk);
}

Rosario19

unread,

Feb 29, 2016, 1:56:59 PM2/29/16

to

On Sun, 28 Feb 2016 23:59:48 -0800 (PST), luser droog wrote:

>#include "ar.h"
>#include "en.h"
>#include "st.h"
>#include "wd.h"
>#include "vb.h"
>#include "ex.h"

i know to be again all
but why are necessary too much headers?

for what i have seen it is only necessary one header for compile the
.dll file

and one header for the .c program that call .dll functions...

Rosario19

unread,

Feb 29, 2016, 1:58:54 PM2/29/16

to

i'm for the monofile: one gigant .dll that has all the functions
possible...

Rosario19

unread,

Feb 29, 2016, 2:14:43 PM2/29/16

to

On Mon, 29 Feb 2016 19:58:46 +0100, Rosario19 wrote:

>i'm for the monofile: one gigant .dll that has all the functions
>possible...

one gigant .dll in each programming language one use
and one header for each programming language for call these functions
i think i had done that for [C/]C++ and assembly languages

luser droog

unread,

Feb 29, 2016, 3:46:47 PM2/29/16

to

The functions are separated into
logically-related "modules". It
promotes better organization of
the program as a whole.

Tim Rentsch

unread,

Mar 3, 2016, 11:40:47 AM3/3/16

to

luser droog <luser...@gmail.com> writes:

> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
>> luser droog <luser...@gmail.com> writes:
>>
>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
>>>> Alright, so maybe my last post wasn't interesting enough. Perhaps not
>>>> hard-core C hacksumschit enough. So here ya you. I dare you to find a
>>>> bug. I dare you to even make sense of this table-driven madness.
>>>>
>>>> These modules both call functions exported from other modules. You can
>>>> find them via their header files from the repo:
>>>> https://github.com/luser-dr00g/inca/tree/master/olmec
>>>>

>>>> [...]
>
> [..updated example, about 400 lines..]

It looks like you have posted several updates. I don't want to go
through them and figure out what's what. If you send me a tarball
or a link to a tarball in email, or post a link to a tarball here
in the newsgroup, I'll try to take a look at it. I don't want to
go through github because I want to be sure we are in sync about
what is being looked at; I don't want to deal with a moving target,
or do any fussing around in an attempt to avoid a situation where
the target might move. Okay?

luser droog

unread,

Mar 3, 2016, 2:59:10 PM3/3/16

to

Understood. In anticipation of this, I did change the subject line.
So there should be one body of code with the subject that says
just "Parser" but not "Scanner and Parser". That represents my
best effort so far. But I neglected to list the supplementary
headers there.

That said, I have made a few small changes while working with
other files. So, perhaps better to use this link:
https://github.com/luser-dr00g/inca/archive/99518e3d2b82751b5d9910b341c61fda14fcbf2e.zip
which is the current commit complete project, but will not change
if/when I add further commits.

The files of interest for the parsing are:
olmec/ex.h
olmec/ex.c
olmec/ex_private.h
and additional referenced headers:
olmec/ar.h array
olmec/en.h encoding
olmec/st.h symbol table
olmec/wd.h scanner
olmec/vb.h verbs (operators)
olmec/debug.h printf macro

If needed, I could make an extract of just these files. But I'd
need to find somewhere to upload it. I've made a typescript
listing these files as a gist:
https://gist.github.com/luser-dr00g/a2c35f698e1f5f4e23b0
which is also a stable link.

The ex.c file refers to "adverbs" as a distinct type, but these
use exactly the same structure as verbs, and are accessed through
the symbol table, thus there was no need to include the "av.h"
header at all, which only exposes an init_ function:

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ cat av.h

void init_av(symtab st);

josh@LAPTOP-ILO10OOF ~/inca/olmec
$

Tim Rentsch

unread,

Mar 3, 2016, 4:17:34 PM3/3/16

to

luser droog <luser...@gmail.com> writes:

> On Friday, February 26, 2016 at 10:07:29 AM UTC-6, Tim Rentsch wrote:
>
>> You see what I mean?
>
> Yes, yes, yes! Thank you immensely much.

No point to this reply except to say I'm glad my comments
were of help.

Tim Rentsch

unread,

Mar 3, 2016, 5:18:01 PM3/3/16

to

luser droog <luser...@gmail.com> writes:

> On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
>> luser droog <luser...@gmail.com> writes:
>>
>>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
>>>> luser droog <luser...@gmail.com> writes:
>>>>
>>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:

> [what code to look at?]

>
> perhaps better to use this link:
> https://github.com/luser-dr00g/inca/archive/99518e3d2b82751b5d9910b341c61fda14fcbf2e.zip
> which is the current commit complete project, but will not change

> if/when I add further commits. [...]

Okay, I will try to take a look when I can.

luser droog

unread,

Mar 7, 2016, 3:08:11 AM3/7/16

to

On Thursday, March 3, 2016 at 4:18:01 PM UTC-6, Tim Rentsch wrote:
> luser droog <luser...@gmail.com> writes:
>
> > On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
> >> luser droog <luser...@gmail.com> writes:
> >>
> >>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
> >>>> luser droog <luser...@gmail.com> writes:
> >>>>
> >>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> > [what code to look at?]
> >
> > perhaps better to use this link:
> > https://github.com/luser-dr00g/inca/archive/99518e3d2b82751b5d9910b341c61fda14fcbf2e.zip
> > which is the current commit complete project, but will not change
> > if/when I add further commits. [...]
>
> Okay, I will try to take a look when I can.

Awesome. I've also posted the array-handling code
over at
http://codereview.stackexchange.com/questions/122038/ndarrays-in-c-slicing-transposing-polynomials

luser droog

unread,

Mar 10, 2016, 9:56:34 PM3/10/16

to

On Thursday, March 3, 2016 at 4:18:01 PM UTC-6, Tim Rentsch wrote:
> luser droog <luser...@gmail.com> writes:
>
> > On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
> >> luser droog <luser...@gmail.com> writes:
> >>
> >>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
> >>>> luser droog <luser...@gmail.com> writes:
> >>>>
> >>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> > [what code to look at?]
> >
> > perhaps better to use this link:
> > https://github.com/luser-dr00g/inca/archive/99518e3d2b82751b5d9910b341c61fda14fcbf2e.zip
> > which is the current commit complete project, but will not change
> > if/when I add further commits. [...]
>
> Okay, I will try to take a look when I can.

If you haven't looked at it yet, I have made some small improvements
to the parser and extensive improvements to the rest of the files.
I'd still love to get more critique but your earlier effort may have
been sufficient to guide me further. The main parsing function itself
is now very short and very "high-level".

object execute_expression(array e, symtab st){

int n = e->dims[0];

stack *lstk,*rstk;

init_stacks(&lstk, &rstk, e, n);

while(lstk->top){ //left stack not empty

object x = stackpop(lstk);
DEBUG(0,"->%08x(%d,%d)\n", x, gettag(x), getval(x));

if (qprog(x)){ // x is a pronoun?
object y;
if ((y=parse_and_lookup_name(lstk, rstk, x, st)) != 0)
return y;
} else stackpush(rstk,x);

check_rstk_with_patterns_and_reduce(lstk, rstk, st);
}

return extract_result_and_free_stacks(lstk, rstk);
}

current commit, compilation and execution illustrated below:

https://github.com/luser-dr00g/inca/archive/1bb7a9169ea45304798ba6cc8104d2546ec8995d.zip

Filenames have been changed to longer, more meaningful words.
ex.c -> exec.c
wd.c -> lex.c
etc

And I've incorporated most of the suggestions from the array
review: http://codereview.stackexchange.com/questions/122038/ndarrays-in-c-slicing-transposing-polynomials
I still like my typedef'ed pointers.

Many of the verbs are only defined over vectors, but a small
handful are implemented. The quad-k variable contains
an illustration of the complete "ALT"-keyboard, accessed by holding
alt- and pressing the corresponding key. Capital letters and numbers
are the same in both keyboards, to help orientation.

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ touch *.c

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ make
cc -c -o main.o main.c
cc -c -o editor.o editor.c
cc -c -o encoding.o encoding.c
cc -c -o lex.o lex.c
cc -c -o exec.o exec.c
cc -c -o verbs.o verbs.c
cc -c -o adverbs.o adverbs.c
cc -c -o xverb.o xverb.c
cc -c -o print.o print.c
cc -c -o io.o io.c
cc -c -o array.o array.c
cc -c -o qn.o qn.c
cc -c -o symtab.o symtab.c
cc -o olmec main.o editor.o encoding.o lex.o exec.o verbs.o adverbs.o xverb.o print.o io.o array.o qn.o symtab.o
m4 -D UNITS="qn io array symtab" all_tests.m4 >all_tests.c
cc -o all_tests all_tests.c encoding.o
cc io_test.c -o io_test
cc array_test.c -o array_test
cc qn_test.c -o qn_test
cc -o symtab_test symtab_test.c encoding.o
---------------
running qn_test
10 2 0 (2, 1)
10 2 0 (7, 9)
10 5 1 (9, 0, 1, 0, 0, 0)
10 4 1 (9, 0, 1, 0, 0)
10 0 0 ()
10 2 0 (3, 3)
10 5 1 (7, 6, 9, 9, 9, 9)
10 4 1 (7, 6, 9, 9, 9)
ALL TESTS PASSED
Tests run: 2
---------------
running io_test
ALL TESTS PASSED
Tests run: 4
---------------
running array_test
ALL TESTS PASSED
Tests run: 1
---------------
running symtab_test
0
ALL TESTS PASSED
Tests run: 2
Grand Total tests run: 9

josh@LAPTOP-ILO10OOF ~/inca/olmec
$ ./olmec
⎕k
·¨¯<≤=≥>≠¨²_÷
◆1234567890-×
QWERTYUIOP→£≠
?⍵∈⍴∾↑↓⍳○⋆←]⍀
ASDFGHJKL:"
⍺⌈⌊_∇∆∘'⎕º´
ZXCVBNM«»¿
⊂⊃∩∪⊥⊤|¶·⌿

a←⍳12
0 1 2 3 4 5 6 7 8 9 10 11
b←3 2 3⍴a
0 1 2
3 4 5

6 7 8
9 10 11

0 1 2
3 4 5

×⍀(4⍴3),1
81 27 9 3 1
'Hello, World!'
Hello, World!

josh@LAPTOP-ILO10OOF ~/inca/olmec
$

Tim Rentsch

unread,

Mar 16, 2016, 2:46:23 PM3/16/16

to

I'm close to having some comments but I think you have
posted something else in the meantime, so let me look
at that before posting further.

luser droog

unread,

Mar 16, 2016, 11:58:57 PM3/16/16

to

Looking forward to it. I forgot to mention anywhere that
I tried to combine two of your suggestions in a clever way.
I added the `typedef int token;` lines to the .c file, just
before #including the *_private.h header which uses the
typedefs. So they serve a double-duty as symbolic "guards"
of the private parts. So another file cannot simply
#include the *_private.h without a lot of extra work.

Tim Rentsch

unread,

Mar 17, 2016, 4:27:38 PM3/17/16

to

luser droog <luser...@gmail.com> writes:

> On Thursday, March 3, 2016 at 4:18:01 PM UTC-6, Tim Rentsch wrote:
>> luser droog <luser...@gmail.com> writes:
>>
>>> On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
>>>> luser droog <luser...@gmail.com> writes:
>>>>
>>>>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
>>>>>> luser droog <luser...@gmail.com> writes:
>>>>>>
>>>>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
>>>
>>> [what code to look at?]
>>>

>>> perhaps better to use this link: [...]

>>
>> Okay, I will try to take a look when I can.

I have only just looked at this larger posting, so I don't have
much to say about it yet, but let me give what comments I have
from the previous iteration. I'm going to select/rearrange
pieces of what follows for comments.

> Filenames have been changed to longer, more meaningful words.
> ex.c -> exec.c
> wd.c -> lex.c
> etc

Good, good to hear.

> And I've incorporated most of the suggestions from the array

> review: [..link..]

> I still like my typedef'ed pointers.

I haven't had a chance to look at this yet, so I don't know what
was discussed.

> Many of the verbs are only defined over vectors, [...]

I noticed that, but the level of detail is farther down than
where I got, so right now I have nothing more to say about it.

Now we start my comments for the code of the previous link.
Forgive me if what I have to say isn't very specific in some
places, I hope it will be helpful despite that. Also some of my
suggestions could be taken as "style" advice, but I mean them
more seriously than just a stylistic preference. The comments
will start at a high level but should get more detailed and more
specific as we go on, so please bear with me. Okay, here we go...

I found the code very tangled. Somewhat more concretely, I can't
help the feeling that to understand any of it I need to understand
all of it. This reaction improved as I looked more, but never got
to the point where I got past it. Let me see if I can make some
constructive suggestions.

First: organize, modularize, and encapsulate. There are a few
types that are used basically everywhere. Fortunately, these
types can mostly be made opaque types, with access to contents
being restricted to one or sometimes two .c files. The types
I'm talking about are (using slightly different struct tags):

typedef struct array_s *array;
typedef struct symbol_table *symtab;
typedef struct verb_s *verb;

plus the enumeration type that defines LITERAL, NUMBER, etc. Put
these type definitions in a common header ("common.h", perhaps),
along with #include lines for any standard C headers that don't
involve functions (eg, <stddef.h>, <stdarg.h>, <limits.h>, etc) or
are common enough so they may be considered ubiquitous. Add an
include line for "common.h" (or whatever you decide to call it) at
the start of every other header file.

Besides these few common types, the code is pretty helter-skelter
about type definitions. Some things that should have typedefs
do not, and at least one could be eliminated. About a third
of the typedef names are defined in .c files. I strongly
recommend putting /all/ typedefs in header files (_private headers
in some cases), even you want to enforce a restriction that a
particular type may be used only in one .c file.

Incidentally, on the subject of names: any name that flows across
an interface should (under my recommendation) be made up of whole
English words, never shortenings like "tab" or "ptr". This rule
always includes all type names and all struct/union members.
There can be exceptions to this rule in exceptional cases, like
"U32" for a 32-bit unsigned integer type, but the exceptions
never include abbreviations - if it's important enough to
indicate a word, use the whole word. Example: rather than 'dims'
I suggest 'extents'.

Getting back to types, some types that should be given type names
but aren't are what I'm calling here 'Monadic' and 'Dyadic' (using
the type name 'Datum' for what I think you call 'object'):

typedef Datum (*Monadic)( Datum, verb );
typedef Datum (*Dyadic)( Datum, Datum, verb );

These types occur several places throughout the code, but always
written out, never with a type name. That should be corrected.

A type name that could be eliminated is 'parsetab'. The type of
this name is really needed only to define the object 'ptab'. That
object is given as an argument to check_pattern(), but since that
function has only one caller the 'ptab' argument can be eliminated
and the function just use ptab directly. (In fact, since 'ptab'
is constructed using the PARSETAB macro, and the only other uses
of it are to determine its extent, the definition of 'ptab' could
be moved to inside the check_pattern() function, with its extent
being provided by a different name derived via a PARSETAB call.)

There are a couple of types used in wd.c:

typedef char state;
typedef char state_and_action_code;

These should be 'int' rather than 'char' (and in fact I would
probably make them 'unsigned' rather than 'int' unless there was
some other reason not to).

The type name 'object' (a synonym for int) is used in some places
to mean a value suitable for gettag(), etc. Unfortunately the name
'object' isn't used consistently, so sometimes we see 'object' and
sometimes we see 'int'. I tried to straighten this out with a new
type name, 'Datum', and making it a struct to get help from the
compiler, but ended up going around in circles changing 'int' to
'Datum' in some places, only to have to change it back again
because the values was used like a regular int! It looks to me
like there is some fundamental confusing here, and it may be
possible to produce an 'int' value (eg, with a multiplication) that
later ends up being interpred as an 'object' with some weird tag
value. Not good! I'm sorry I can't shed more light on what to do
about that.

I need to say more about defining the contents for the abstract
types mentioned above, but let me do that further on.

Moving on to macros... I see what you're doing with macros like,
eg, PREDTAB, and it isn't too bad once I got the hang of it. But
it could be easier if these kinds of macros were given better names
and definitions, and their expansions were more localized. These
macros aren't really tables but more like a kind of compile-time
iterator. Here is an example:

#define PREDICATES_FOREACH( WHAT ) \
WHAT( ANY = 1, qa, 1 ) \
/* ... */ \
/*** end of PREDICATES_FOREACH ***/

// ...

enum predicate {
#define ENUMERATOR( name_and_value, ... ) name_and_value,
PREDICATES_FOREACH( ENUMERATOR )
#undef ENUMERATOR
EDGE = MARK + ASSN + LPAR,
AVN = VRB + NOUN + ADV
};

The combination of a more suggestive name, giving a name to the
function argument, and localizing the "call" to the iterator, to me
makes it easier to see what's going on. I recommend making changes
along these lines to PREDTAB() and other similar macros like this,
namely, ADVERBTAB(), ALPHATAB(), PARSETAB(), and VERBTAB().

For other macros.. some are not too bad, but others... whew!
Let's see... some of the shorter macros are "convenience" macros,
and I think are better left out. Examples are 'lstk' and 'rstk' in
init_stack(). The killers though are the long macros used as
"open" function bodies, eg, DERIV(), DECLFG, scalarop(), and
scalarmonad(). Having these be macros makes the code difficult to
understand and even more difficult to modify. I recommend
re-working those, ideally eliminating them as macros and using
functions instead. I looked at a way to do that for DECLFG and
think it is not especially difficult (sorry I don't have the code
for that handy, but ask me again if you can't figure something
out). The macros for dealing with stacks should be eliminated
and replaced with function definitions (more about that further
down).

Getting back to defining abstract types.. I think all the struct
types (or at least the major struct types) can be made abstract,
so members are accessed only by the module that defines the type.
Two notes on that. One, for symbol tables, in most cases the
value of findsym() is used only to access the member 'val' (which
should be 'value' rather than 'val'). I haven't checked this
carefully but I think most or all of these, not counting the ones
in st.c, could use a new interface 'symbol_value()' that simply
called findsym() and then returned the 'val' member of the
returned value (with some sort of "magic" value to me the call to
findsym() didn't find anything. Two, for arrays, vb.c does a lot
with them, so it needs special access. For all other uses of
struct types, I think it's easy to change code around and/or add
member access functions (perhaps as inline functions) so no other
modules need direct access to those members. It really helps to
be able to draw clean lines of division.

Last but not least we get to some function-level code. As
serendipity would have it, you wrote this:

> If you haven't looked at it yet, I have made some small improvements
> to the parser and extensive improvements to the rest of the files.
> I'd still love to get more critique but your earlier effort may have
> been sufficient to guide me further. The main parsing function itself
> is now very short and very "high-level".
>
> object execute_expression(array e, symtab st){
> int n = e->dims[0];
> stack *lstk,*rstk;
>
> init_stacks(&lstk, &rstk, e, n);
>
> while(lstk->top){ //left stack not empty
> object x = stackpop(lstk);
> DEBUG(0,"->%08x(%d,%d)\n", x, gettag(x), getval(x));
>
> if (qprog(x)){ // x is a pronoun?
> object y;
> if ((y=parse_and_lookup_name(lstk, rstk, x, st)) != 0)
> return y;
> } else stackpush(rstk,x);
>
> check_rstk_with_patterns_and_reduce(lstk, rstk, st);
> }
>
> return extract_result_and_free_stacks(lstk, rstk);
> }

By coincidence (and it was only a coincidence), I was focused on
a re-write of this function also. I went through a couple
different versions, but cutting to the chase here is the last
one:

Datum
execute_expression( array stuff, symtab st ){
Stack left = new_left_stack_for( stuff );
Stack right = new_stack( stack_capacity( left ) );
stack_push_datum( right, null_Datum );

while( ! stack_is_empty( left ) ){
StackElement x = stack_pop( left );
if( ! is_pronoun( x.datum ) ) stack_push( right, x );
else {
// some appropriate code here
// probably ending with 'continue;'
}

while( stack_element_count( right ) >= 4 ){
StackElement *items = stack_top_elements_address( right, 4 );
if(0){ /** start the elseif chain going **/ }
PARSETAB( ELSEIFS )
else break;
}
}

return stack_release( left ), penultimate_prereleased_value( right );
}

Before I give the rest of the code let me point out some key
elements. One, I think this refactoring preserves the semantics
but I am not 100% sure of that; if it does, all well and good,
and if it doesn't then that says something about where some
confusion is.

Two, you may notice on the third line a call to the function
'stack_push_datum()'. In this version, stack elements have two
components, the Datum (or object), and the classification code
for that Datum, which is stored along with the value of interest
so it never has to be computed.

Three, we don't move items off the stack to test them, we test
them in place on the top of the stack. To do that we make use of
a function 'stack_top_elements_address()', which gives an address
of a stack element relative to the top of the stack.

Four, the for()/switch() combination in the original seems
wasteful, since the switch() is just going to the entry that the
for() is counting. I thought it better to do that with a chain
of else if()'s, which are produced via a call to PARSETAB().

Five, if none of the else if()'s trigger, we shouldn't go around
the while() loop again, hence the 'else break;' at the end.

Six, there are a few other function calls, all of which I hope
are fairly self-explanatory.

Here are the sub-functions that are not part of the Stack
"module"

static Stack
new_left_stack_for( array stuff ){
int n = stuff->dims[0];
Stack r = new_stack( n + sum_symbol_lengths( stuff, n ) + 1 );
stack_push_datum( r, mark_Datum );
for( int i = 0; i < n; i++ ) stack_push_datum( r, stuff->data[i] );
return r;
}

static int
matches_ptab_pattern( StackElement items[ static 4 ], int i ){
return
items[3].code & ptab[i].c[0] &&
items[2].code & ptab[i].c[1] &&
items[1].code & ptab[i].c[2] &&
items[0].code & ptab[i].c[3]
;
}

static Datum
penultimate_prereleased_value( Stack s ){
Datum result = stack_top_elements_address( s, 2 )->datum;
return stack_release( s ), result;
}

Notice that in matches_ptab_pattern(), the index values go in
opposite directions, because items[3] is the top of the stack,
which needs to be compared to ptab[i].c[0], etc. Other than
that I think these should be understandable, especially if
compared to the code that they are meant to replace.

This brings us to the ELSEIFS macro (the argument given when
calling PARSETAB(), remember), which is where the real guts
are. In reading this remember that 'items' points to the
fourth item down from the top of the stack, which has not
been popped:

#define ELSEIFS( label, p1, p2, p3, p4, func, pre, x, y, z, post, post2 ) \
else if( matches_ptab_pattern( items, label ) ){ \
stack_prune( right, 4 ); \
items[ pre>=0 ] = Datum_to_StackElement( func( S(x), S(y), S(z), st ) ); \
if( post>=0 ) items[(pre>=0) + 1 ] = items[3-post]; \
if( post2>=0 ) items[(pre>=0) + 1 + (post>=0) ] = items[3-post2]; \
stack_reclaim( right, (pre>=0) + 1 + (post>=0) + (post2>=0) ); \
} \
/* end of ELSEIFS macro */

#define S(k) ((k) < 0 ? 0 : items[3-(k)].datum)

Notice how this works. If the pattern matches, we consolidate the
top four items appropriately, but this is done by moving things
around in the top four stack elements, not by pushing and popping.
The index values (ie, inside []'s) use conditional expressions to
compute the appropriate offsets. The call to stack_prune() "cuts
back" the stack, and at the end stack_reclaim() is used to adjust
it back up again. A helper macro S() is used to make it easier to
see what values are provided to the inner function call.

I think that is everything except for the Stack type and functions.
Here these are (I was going to include only some, but this message
is so long already that a few more lines won't matter):

/*** Stack type ***/
typedef struct {
unsigned next;
unsigned limit;
StackElement elements[];
} *Stack;

/*** Stack functions ***/

static Stack
new_stack( unsigned n ){
Stack r = malloc( sizeof *r + n * sizeof r->elements[0] );
return r->next = 0, r->limit = n, r;
}

static void
stack_release( Stack s ){
free( s );
}

static unsigned
stack_capacity( Stack s ){
return s->limit;
}

static int
stack_is_empty( Stack s ){
return s->next > 0;
}

static void
stack_push_datum( Stack s, Datum d ){
stack_push( s, Datum_to_StackElement( d ) );
}

static StackElement
Datum_to_StackElement( Datum d ){
return (StackElement){ d, classify( d ) };
}

static void
stack_push( Stack s, StackElement e ){
s->elements[ s->next++ ] = e;
}

static StackElement
stack_pop( Stack s ){
return s->elements[ -- s->next ];
}

static unsigned
stack_element_count( Stack s ){
return s->next;
}

static void
stack_prune( Stack s, unsigned n ){
s->next -= n;
}

static void
stack_reclaim( Stack s, unsigned n ){
s->next += n;
}

static StackElement *
stack_top_elements_address( Stack s, unsigned n ){
return s->elements + s->next - n;
}

So, for better or worse, there's a flock of comments for you. By
the way the code fragments all come from code that has been
compiled but hasn't been run or tested in any way. I would like
to see some kind of feedback from you on this before I look any
further into the next iteration.

luser droog

unread,

Mar 18, 2016, 10:42:30 PM3/18/16

to

I'm still working through the details of
your suggested changes. But I'm very
excited that you found a nice way to
avoid recalculating the predicates over
and over again. And if the stack is doing
all this, it deserves to graduate to
functions.

For the verbs, the mr lr and rr members
are to declare the ranks upon which they
are defined. It should definitely be
possible to replace the vector behavior
with a function to coordinate the
recursion. And then more logic will be
added to permit derived verbs to specify
different ranks for mr lr and rr, and the
behavior will extend naturally, by another
(set of) function(s). I'm still working
my brain through how this part should
operate.

The math functions will be fitted with
overflow detection which will promote
the result to the TBD `NUMBER` type.
And then I suppose I'll have more work
to do the trace through the "internal
maths" to make sure overflows can't
happen in my unchecked code. Or I could
call the checked functions all over and
signal an "internal overflow" error or
something if one of them pops.

I'll post again after I get it running,
and double check that i've applied
everything.

Thank you for your efforts!

I may yet shake off more of my golf habits.

M. Joshua Ryan ( luser droog )

luser droog

unread,

Mar 19, 2016, 3:03:20 AM3/19/16

to

On Thursday, March 17, 2016 at 3:27:38 PM UTC-5, Tim Rentsch wrote:
> luser droog <luser...@gmail.com> writes:
>
> > On Thursday, March 3, 2016 at 4:18:01 PM UTC-6, Tim Rentsch wrote:
> >> luser droog <luser...@gmail.com> writes:
> >>
> >>> On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
> >>>> luser droog <luser...@gmail.com> writes:
> >>>>
> >>>>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
> >>>>>> luser droog <luser...@gmail.com> writes:
> >>>>>>
> >>>>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:
> >>>
> >>> [what code to look at?]
> >>>
> >>> perhaps better to use this link: [...]
> >>
> >> Okay, I will try to take a look when I can.
>
> I have only just looked at this larger posting, so I don't have
> much to say about it yet, but let me give what comments I have
> from the previous iteration. I'm going to select/rearrange
> pieces of what follows for comments.

Ditto.

> Besides these few common types, the code is pretty helter-skelter
> about type definitions. Some things that should have typedefs
> do not, and at least one could be eliminated. About a third
> of the typedef names are defined in .c files. I strongly
> recommend putting /all/ typedefs in header files (_private headers
> in some cases), even you want to enforce a restriction that a
> particular type may be used only in one .c file.

Still need to examine this more closely.

> Incidentally, on the subject of names: any name that flows across
> an interface should (under my recommendation) be made up of whole
> English words, never shortenings like "tab" or "ptr". This rule
> always includes all type names and all struct/union members.
> There can be exceptions to this rule in exceptional cases, like
> "U32" for a 32-bit unsigned integer type, but the exceptions
> never include abbreviations - if it's important enough to
> indicate a word, use the whole word. Example: rather than 'dims'
> I suggest 'extents'.

Still need to examine this more closely.

> Getting back to types,
<many smallish type issues>

> The type name 'object' (a synonym for int) is used in some places
> to mean a value suitable for gettag(), etc. Unfortunately the name
> 'object' isn't used consistently, so sometimes we see 'object' and
> sometimes we see 'int'. I tried to straighten this out with a new
> type name, 'Datum', and making it a struct to get help from the
> compiler, but ended up going around in circles changing 'int' to
> 'Datum' in some places, only to have to change it back again
> because the values was used like a regular int! It looks to me
> like there is some fundamental confusing here, and it may be
> possible to produce an 'int' value (eg, with a multiplication) that
> later ends up being interpred as an 'object' with some weird tag
> value. Not good! I'm sorry I can't shed more light on what to do
> about that.

Still need to examine this more closely. Overflow for 'user' arithmetic
is already planned. But for the many internal manipulations, additional
care will be required.

> I need to say more about defining the contents for the abstract
> types mentioned above, but let me do that further on.
>
> Moving on to macros...

> For other macros.. some are not too bad, but others... whew!
> Let's see... some of the shorter macros are "convenience" macros,
> and I think are better left out. Examples are 'lstk' and 'rstk' in
> init_stack(). The killers though are the long macros used as
> "open" function bodies, eg, DERIV(), DECLFG, scalarop(), and
> scalarmonad(). Having these be macros makes the code difficult to
> understand and even more difficult to modify. I recommend
> re-working those, ideally eliminating them as macros and using
> functions instead. I looked at a way to do that for DECLFG and
> think it is not especially difficult (sorry I don't have the code
> for that handy, but ask me again if you can't figure something
> out). The macros for dealing with stacks should be eliminated
> and replaced with function definitions (more about that further
> down).

I imagine I can just pack all the variables in a struct and
construct that with a function. It'll add a `vb.` or something
in front of stuff but that's not terrible.

My other message discusses part of the strategy for the long
"open-coded" macros.

> Getting back to defining abstract types.. I think all the struct
> types (or at least the major struct types) can be made abstract,
> so members are accessed only by the module that defines the type.
> Two notes on that. One, for symbol tables, in most cases the
> value of findsym() is used only to access the member 'val' (which
> should be 'value' rather than 'val'). I haven't checked this
> carefully but I think most or all of these, not counting the ones
> in st.c, could use a new interface 'symbol_value()' that simply
> called findsym() and then returned the 'val' member of the
> returned value (with some sort of "magic" value to me the call to
> findsym() didn't find anything. Two, for arrays, vb.c does a lot
> with them, so it needs special access. For all other uses of
> struct types, I think it's easy to change code around and/or add
> member access functions (perhaps as inline functions) so no other
> modules need direct access to those members. It really helps to
> be able to draw clean lines of division.

Still need to examine this more closely.

> Last but not least we get to some function-level code. As
> serendipity would have it, you wrote this:
>

> By coincidence (and it was only a coincidence), I was focused on
> a re-write of this function also. I went through a couple
> different versions, but cutting to the chase here is the last
> one:
>

> Before I give the rest of the code let me point out some key
> elements. One, I think this refactoring preserves the semantics
> but I am not 100% sure of that; if it does, all well and good,
> and if it doesn't then that says something about where some
> confusion is.

I think it may have changed things slightly, particularly with
the `pre` macro variable. But adjustments might be made to the
table to fix that. As it is, I took the opportunity to redesign
the table itself which now uses just 2 numbers to control the
whole manipulation. These indicate the start and finish indices
of the elements to be passed to the function. This required
ancillary changes to spec() and punc() since the function
arguments encompass the entire range of selected elements
including the noise <- in assignment and the parens in punc().

#define PARSE_PRODUCTIONS_FOREACH(_) \
/* p[0] p[1] p[2] p[3] func start finish */\
/*-->items[3] it[2] it[1] it[0] */\
_(L0, EDGE, MON, NOUN, ANY, monadic, 2, 1 ) \
_(L1, EDGE+AVN, VRB, MON, NOUN, monadic, 1, 0 ) \
_(L2, ANY, NOUN, DEX, ANY, monadic, 1, 2 ) \
_(L3, EDGE+AVN, NOUN, DYA, NOUN, dyadic, 2, 0 ) \
_(L4, EDGE+AVN, NOUN+VRB, ADV, ANY, adv, 1, 2 ) \
_(L5, ANY, LEV, NOUN+VRB, ANY, adv, 2, 1 ) \
_(L6, EDGE+AVN, NOUN+VRB, CONJ, NOUN+VRB, conj_, 2, 0 ) \
_(L7, VAR, ASSN, AVN, ANY, spec, 3, 1 ) \
_(L8, LPAR, ANY, RPAR, ANY, punc, 3, 1 ) \
_(L9, MARK, ANY, RPAR, ANY, punc, 1, 2 ) \
_(L10,ANY, LPAR, ANY, NUL, punc, 2, 1 ) \

/**/

// generate labels to coordinate table and execution

enum {
#define PRODUCTION_LABEL(label, ...) label,
PARSE_PRODUCTIONS_FOREACH(PRODUCTION_LABEL)
#undef PRODUCTION_LABEL

};

// create parsetab array of structs containing the patterns

static
struct parsetab { int c[4]; } ptab[] = {
#define PRODUCTION_PATTERNS(label, pat1, pat2, pat3, pat4, ...) \

{pat1, pat2, pat3, pat4},

PARSE_PRODUCTIONS_FOREACH(PRODUCTION_PATTERNS)
#undef PRODUCTION_PATTERNS
};

static int min(int x, int y){
return x<y? x: y;
}

#define PRODUCTION_ELSEIFS(label, p1,p2,p3,p4, func, s, f) \
else if (matches_ptab_pattern(items, label)) { \
stack_prune(right, 4); \
int dir = f-s>0 ? 1 : -1; \
int n = 1+abs(f-s); \
int minfs = min(f,s); \
int excess = 4 - n - minfs; \
DEBUG(0, "s=%d f=%d dir=%d, n=%d, minfs=%d, excess=%d\n", \
s, f, dir, n, minfs, excess); \
items[minfs] = \
datum_to_stack_element( \
func(items[s].datum, \
n>=2? items[s+dir].datum: 0, \
n>=3? items[s+2*dir].datum: 0, \
env) \
); \
for (int i=0; i<excess; ++i){ \
items[minfs+1+i] = items[minfs+i+n]; \
} \
stack_reclaim(right, excess+1+minfs); \

}

> Two, you may notice on the third line a call to the function
> 'stack_push_datum()'. In this version, stack elements have two
> components, the Datum (or object), and the classification code
> for that Datum, which is stored along with the value of interest
> so it never has to be computed.

Yes! I had noticed this and was considering caching the results
of classify(). Your solution provides exactly what I hadn't solved
yet.

> Three, we don't move items off the stack to test them, we test
> them in place on the top of the stack. To do that we make use of
> a function 'stack_top_elements_address()', which gives an address
> of a stack element relative to the top of the stack.

Yes, the extra movement onto the temp array felt clumsy. But I hadn't
come up with anything nicer.

> Four, the for()/switch() combination in the original seems
> wasteful, since the switch() is just going to the entry that the
> for() is counting. I thought it better to do that with a chain
> of else if()'s, which are produced via a call to PARSETAB().
>
> Five, if none of the else if()'s trigger, we shouldn't go around
> the while() loop again, hence the 'else break;' at the end.

Elegant.

> Six, there are a few other function calls, all of which I hope
> are fairly self-explanatory.
>
> Here are the sub-functions that are not part of the Stack
> "module"
>

>
> Notice that in matches_ptab_pattern(), the index values go in
> opposite directions, because items[3] is the top of the stack,
> which needs to be compared to ptab[i].c[0], etc. Other than
> that I think these should be understandable, especially if
> compared to the code that they are meant to replace.

Indeed, yes. This is so much better now.

> This brings us to the ELSEIFS macro (the argument given when
> calling PARSETAB(), remember), which is where the real guts
> are. In reading this remember that 'items' points to the
> fourth item down from the top of the stack, which has not
> been popped:
>

> Notice how this works. If the pattern matches, we consolidate the
> top four items appropriately, but this is done by moving things
> around in the top four stack elements, not by pushing and popping.
> The index values (ie, inside []'s) use conditional expressions to
> compute the appropriate offsets. The call to stack_prune() "cuts
> back" the stack, and at the end stack_reclaim() is used to adjust
> it back up again. A helper macro S() is used to make it easier to
> see what values are provided to the inner function call.

I've tried to follow suit with my further development. Using the
same tools you've supplied but with a simpler table.

> I think that is everything except for the Stack type and functions.
> Here these are (I was going to include only some, but this message
> is so long already that a few more lines won't matter):
>

> static int
> stack_is_empty( Stack s ){
> return s->next > 0;

s.b. return s->next == 0;

> }
>
>
> So, for better or worse, there's a flock of comments for you. By
> the way the code fragments all come from code that has been
> compiled but hasn't been run or tested in any way. I would like
> to see some kind of feedback from you on this before I look any
> further into the next iteration.

Well planned. Might as well skip the previous zip in preference to
this one which incorporates all of these ideas (I hope, except the ones
undeleted above).
https://github.com/luser-dr00g/inca/archive/bc8b2ce1dd6e84986f48b1d3b120f5f825008a40.zip

Rosario19

unread,

Mar 19, 2016, 3:09:01 AM3/19/16

to

On Thu, 17 Mar 2016 13:27:22 -0700, Tim Rentsch wrote:

>
> static int
> stack_is_empty( Stack s ){
> return s->next > 0;
> }

i never see much of the all the remain
but it seems to me s->next is like one index...
so stack is empty when s->next==0 not when s->next>0

Tim Rentsch

unread,

Mar 19, 2016, 4:51:36 PM3/19/16

to

Yes, I did that deliberately to see if people were
paying attention. *wink*

Tim Rentsch

unread,

Mar 19, 2016, 5:44:55 PM3/19/16

to

luser droog <luser...@gmail.com> writes:

> On Thursday, March 17, 2016 at 3:27:38 PM UTC-5, Tim Rentsch wrote:
>> luser droog <luser...@gmail.com> writes:
>>
>>> On Thursday, March 3, 2016 at 4:18:01 PM UTC-6, Tim Rentsch wrote:
>>>> luser droog <luser...@gmail.com> writes:
>>>>
>>>>> On Thursday, March 3, 2016 at 10:40:47 AM UTC-6, Tim Rentsch wrote:
>>>>>> luser droog <luser...@gmail.com> writes:
>>>>>>
>>>>>>> On Sunday, February 21, 2016 at 11:36:53 AM UTC-6, Tim Rentsch wrote:
>>>>>>>> luser droog <luser...@gmail.com> writes:
>>>>>>>>
>>>>>>>>> On Friday, February 12, 2016 at 6:26:51 AM UTC-6, luser droog wrote:

[...]

I haven't looked at any further revisions yet, but let me give
one quick comment.

> I think it may have changed things slightly, particularly with
> the `pre` macro variable. But adjustments might be made to the
> table to fix that. As it is, I took the opportunity to redesign
> the table itself which now uses just 2 numbers to control the
> whole manipulation. These indicate the start and finish indices
> of the elements to be passed to the function. This required
> ancillary changes to spec() and punc() since the function
> arguments encompass the entire range of selected elements
> including the noise <- in assignment and the parens in punc().
>
> #define PARSE_PRODUCTIONS_FOREACH(_) \
> /* p[0] p[1] p[2] p[3] func start finish */\
> /*-->items[3] it[2] it[1] it[0] */\
> _(L0, EDGE, MON, NOUN, ANY, monadic, 2, 1 ) \
> _(L1, EDGE+AVN, VRB, MON, NOUN, monadic, 1, 0 ) \

> [...etc...]
> /**/
>
> [...]

>
> #define PRODUCTION_ELSEIFS(label, p1,p2,p3,p4, func, s, f) \
> else if (matches_ptab_pattern(items, label)) { \
> stack_prune(right, 4); \
> int dir = f-s>0 ? 1 : -1; \
> int n = 1+abs(f-s); \
> int minfs = min(f,s); \
> int excess = 4 - n - minfs; \
> DEBUG(0, "s=%d f=%d dir=%d, n=%d, minfs=%d, excess=%d\n", \
> s, f, dir, n, minfs, excess); \
> items[minfs] = \
> datum_to_stack_element( \
> func(items[s].datum, \
> n>=2? items[s+dir].datum: 0, \
> n>=3? items[s+2*dir].datum: 0, \
> env) \
> ); \
> for (int i=0; i<excess; ++i){ \
> items[minfs+1+i] = items[minfs+i+n]; \
> } \
> stack_reclaim(right, excess+1+minfs); \
> }

This scheme is way more complicated than it needs to be. If you're
going to change the columns in PARSE_PRODUCTIONS_FOREACH(), you can
get by with only one column at the end instead of three, and a very
simple PRODUCTION_ELSEIFS() macro

> #define PRODUCTION_ELSEIFS(label, p1,p2,p3,p4, reduce_it) \
> else if (matches_ptab_pattern(items, label)) { \
> stack_prune(right, reduce_it(items,env) ); \
> }

where the 'reduce_it' functions do the appropriate manipulations
on the four items, calling 'monadic' or 'dyadic' or whatever,
and returning a number for how much to prune the stack after
everything is done. The various 'reduce_it' functions can be
defined as 'static inline' (which probably won't matter, most
likely they would be expanded inline anyway, but what the heck).
This approach also means that sub-functions like 'monadic'
can accept only what arguments they will actually use, since
'monadic', 'dyadic', etc, no longer need to have all the same
function type.

Doing this also means the various snippets of reduction code will
be regular C code rather than macros, which will make writing and
debugging the thing be a heck of a lot simpler. Also, it may be
possible to combine/collapse the various 'reduce_it' and
'monadic'/'dyadic'/etc functions, but I leave the details of that
to you.

That all make sense? Sorry if it was a bit sketchy.

luser droog

unread,

Mar 20, 2016, 1:15:31 AM3/20/16

to

I kind of see what you mean, but it feels like I'd be doing the job
of the optimizer. All the variables are constant expressions once the
macro substitutions are applied. And even the loop can be unrolled
since `excess` is a constant quantity.

I really like this format of the table because it
now matches the illustration in /An Implementation of J/
http://sblom.github.io/openj-core/iojSent.htm#Parsing
And it even works with a niladic function if I want to add
those by making s==f.

One idea to make the reading easier, what if I add a long comment
where I expand the macros for one of the cases and simplify it?
I thought hard about writing 'start' and 'finish' instead of 's'
and 'f', but s and f I felt made the expressions easier to see.
dir, n, and minfs are obviously correct expressions, and excess
only needs checking a few (or all 11) cases (if need be) to verify.

So I think I disagree with this change. But if any of my support
fails, or if I'm just being obtuse, I'm not unwilling to be
convinced otherwise.

luser droog

unread,

Mar 30, 2016, 1:56:10 AM3/30/16

to

I've generated assembly outputs for the various optimization options
available in gcc. link:
https://github.com/luser-dr00g/inca/blob/279ccec35769c7962f4194a1911df252eb7f5b0f/olmec/execs.tgz?raw=true

I had some difficulty understanding what was going on, but I think all the
math boils down to "4 - some-previously-determined-value-in-a-register".
And the rest is some function calls through an in-register function-pointer
array (pointer).

I know the C part is a little cumbersome, and quite heavy for a macro.
But the benefit for simplifying the specification of the table, I feel,
still outweighs the bulk.

The argument that it makes adding niladic functions easier was bogus.
It didn't.

luser droog

unread,

Mar 30, 2016, 2:40:08 AM3/30/16

to

On Friday, February 26, 2016 at 10:07:29 AM UTC-6, Tim Rentsch wrote:

> red flag. Two, the conversions of the StateAndActionCode
> variable 'cc' are done directly using divide and remainder
> operations; it would be better if these were abstracted as
> operations, eg, state_from() and action_from() functions (or
> macros).

Missed this tidbit until now.
Done and pushed.

Tim Rentsch

unread,

Mar 30, 2016, 3:01:32 PM3/30/16

to

luser droog <luser...@gmail.com> writes:

> On Saturday, March 19, 2016 at 4:44:55 PM UTC-5, Tim Rentsch wrote:

[preamble snipped in interests of space.]

My suggestion is not about optimizing. The purpose is to make
the code easier to understand and to work on. The stuff in the
table might be a convenient way to compactly express what's going
on, but it makes things (pardon my language) hell to understand.
It seems like a lot of work to go to the paper to find out what
the encoding is, just to understand the table, then go back and
look at the macro that /uses/ the code values, and deciphering
all that, when instead you could write half a dozen or so little
functions that each does what it needs to, and could explain in
each case what it is covering. Also there is the general
principle that use of the macro processor to write statements (as
opposed to expressions) usually should be as limited as it can.

> One idea to make the reading easier, what if I add a long comment
> where I expand the macros for one of the cases and simplify it?
> I thought hard about writing 'start' and 'finish' instead of 's'
> and 'f', but s and f I felt made the expressions easier to see.
> dir, n, and minfs are obviously correct expressions, and excess
> only needs checking a few (or all 11) cases (if need be) to verify.

In fact I went through that exercise myself, with the benefit of
the C preprocessor, and a subsequent editing session (since CPP
expansion produces it all on a single line!). That does help a
little in that it shows what values are being used. The problem
is that it doesn't help me understand the code; all the encoding
and decoding just makes it difficult for me to follow what's
going on. It is on that basis that I gave the recommendation.

> So I think I disagree with this change. But if any of my support
> fails, or if I'm just being obtuse, I'm not unwilling to be
> convinced otherwise.

I'm offering comments from the perspective of an outside person
reading the code. I think my comments are good considered from
that perspective. If that is something you think is important,
you might want to get some reactions from other people about
how to proceed before making a final decision. In any case I
leave the decision to you, except to say that this recommendation
is not given lightly or casually. I guess I should also say that
what I said before is a sketch, and adjusting some of the details
might give a better result. How about if you try it out and see
how it goes? There's no substitute for actual experience.

luser droog

unread,

Mar 31, 2016, 12:27:10 AM3/31/16

to

Understood. And I'm trying to be prematurely concerned in that area,
but my experience implementing the xpost postscript interpreter leads
me to want to be sure that it's at least *not* highly suboptimal.

Once the central execution loop is tight enough, execution should
be dominated by name-lookups and the actual work being done.

> The purpose is to make
> the code easier to understand and to work on. The stuff in the
> table might be a convenient way to compactly express what's going
> on, but it makes things (pardon my language) hell to understand.
> It seems like a lot of work to go to the paper to find out what
> the encoding is, just to understand the table, then go back and
> look at the macro that /uses/ the code values, and deciphering
> all that, when instead you could write half a dozen or so little
> functions that each does what it needs to, and could explain in
> each case what it is covering.

That makes sense. I devised it by copying the table into my notebook,
supplementing with extra columns of control constants, then working
out the algebra to discover the needed values from the supplied values.
But I'm not sure of the best way to present this information in the
source, because my working copy is too wide to layout in a reasonable
number of columns.

You shouldn't need to visit the link to understand my program.
I've tried to supply the same style (and quantity) of explanatory
information directly in the source. But it validates the table-
driven approach I believe. I could cite several Iverson sources as
well, but they're much less comprehensible.

As for "to work on", I truly believe this style should make that
easier as the project grows. Only the table should need to be
modified to produce new behaviors. The table becomes the "input form"
in it's own domain-specific language. This mirrors the way the
lexical analyzer was built and is the reason that I coupled them
together into this thread. Both the scanner and parser are virtual,
programmable automatons.

I want to understand what you're saying, because the previous
comments have been so wonderfully transformative. But I'm not
getting it yet. I've got all these great excuses!

> Also there is the general
> principle that use of the macro processor to write statements (as
> opposed to expressions) usually should be as limited as it can.
>

That. Yes. I know. But but but..

Understood. I may revisit this later, but I'm still trying to apply
the remainder of the previous advices. I've just replaced the
scalarop() macro with a function, and fixed a few bugs in the process.

I've been somewhat distracted by reawakened demands from xpost.
It can now run on windows, even though there's no installer yet.
If you set the env variable to point to the data/init.ps files,
it will run.

Now, xpost has reached the exciting stage of Optimizing!
It works (mostly) but it is tooooo slooooow.

It also desperately needs a redesign of the whole graphics backend...

luser droog

unread,

Mar 31, 2016, 3:10:26 AM3/31/16

to

On Wednesday, March 30, 2016 at 2:01:32 PM UTC-5, Tim Rentsch wrote:

> you might want to get some reactions from other people about
> how to proceed before making a final decision.

Any other readers here have opinions? Ain't my table pretty?
current commit:
https://github.com/luser-dr00g/inca/archive/3ae1b7c12f0b722ef7ec4015e57b78ab271c0765.zip

The relevant portion of exec_private.h:

// the Parse Table defines the grammar of the language
// At each stack move, the top four elements of the right stack
// are checked against each of these patterns. A matching pattern

// causes the designated span of elements to be passed to the
// indicated function and the result interleaved back to the
// same position (shifting higher elements down and adjusting
// the top-of-stack pointer).
//
// The table itself is transformed via macro-expansion into
// branches of an if-else chain.
#define PARSE_PRODUCTIONS_FOREACH(_) \
/* p[0] p[1] p[2] p[3] */ \
/*-->items[3] items[2] items[1] items[0] */ \
/* items[start..finish] => func(items[start..finish]) */\
/* func start finish hack */\
_(L0, EDGE, MON, NOUN, ANY, monadic, 2, 1, 0) \
_(L1, EDGE+AVN, VRB, MON, NOUN, monadic, 1, 0, 0) \
_(L2, ANY, NOUN, DEX, ANY, monadic, 1, 2, 0) \
_(L3, EDGE+AVN, NOUN, DYA, NOUN, dyadic, 2, 0, 0) \
_(L4, EDGE+AVN, NOUN+VRB, ADV, ANY, adv, 2, 1, 0) \
_(L5, ANY, LEV, NOUN+VRB, ANY, adv, 1, 2, 0) \
_(L6, EDGE+AVN, NOUN+VRB, CONJ, NOUN+VRB, conj_, 2, 0, 0) \
_(L7, VAR, ASSN, AVN, ANY, spec, 3, 1, 0) \
_(L8, LPAR, ANY, RPAR, ANY, punc, 3, 1, 0) \
_(L9, MARK, ANY, RPAR, ANY, punc, 1, 2, \
stack_push(left,stack_pop(right)) ) \
_(L10,ANY, LPAR, ANY, NUL, punc, 2, 1, 0) \
/**/

enum { // generate labels to coordinate table and execution

#define PRODUCTION_LABEL(label, ...) label,
PARSE_PRODUCTIONS_FOREACH(PRODUCTION_LABEL)
#undef PRODUCTION_LABEL
};

static
struct parsetab { int c[4]; } ptab[] = {
#define PRODUCTION_PATTERNS(label, pat1, pat2, pat3, pat4, ...) \
{pat1, pat2, pat3, pat4},
PARSE_PRODUCTIONS_FOREACH(PRODUCTION_PATTERNS)
#undef PRODUCTION_PATTERNS
};

static
int min(int x, int y){
return x<y? x: y;
}

#define PRODUCTION_ELSEIFS(label, p1,p2,p3,p4, func, s, f, hack) \
else if (matches_ptab_pattern(items, label)) { \
stack_prune(right, 4); /*rewind stack pointer*/\
int dir = f-s>0 ? 1 : -1; /*orientation of stack->args mapping*/\
int n = 1+abs(f-s); /*number of elements to pass (and remove)*/\
int minfs = min(f,s); /*location to store result*/\
int excess = 4 - n - minfs; /*number of elements to shift down after*/\
DEBUG(3, "s=%d f=%d dir=%d, n=%d, minfs=%d, excess=%d\n", \

s, f, dir, n, minfs, excess); \
items[minfs] = \
datum_to_stack_element( \
func(items[s].datum, \
n>=2? items[s+dir].datum: 0, \
n>=3? items[s+2*dir].datum: 0, \
env) \
); \

minfs -= is_mark(items[minfs]); /*suppress "noresult" indicater*/\

for (int i=0; i<excess; ++i){ \

items[minfs+1+i] = items[minfs+i+n]; /*shift down higher elements*/\
} \
stack_reclaim(right, excess+1+minfs); /*fix stack pointer*/\
(void)hack; /*twiddle the mark for fake left paren*/\
}

object niladic(object f, object dummy, object dummy2, symtab env);
object monadic(object f, object y, object dummy, symtab st);
object dyadic(object x, object f, object y, symtab st);
object adv(object f, object g, object dummy, symtab st);
object conj_(object f, object g, object h, symtab st);
object spec(object name, object v, object dummy, symtab st);
object punc(object x, object dummy, object dummy2, symtab st);

Would it be better written as set of handler functions,
instead of all squashed together in a tangle of algebra?
I admit the faults of using a large multi-statement macro.
But the Table! oh the table.

Is anybody reading this?

Tim Rentsch

unread,

Apr 5, 2016, 10:21:25 AM4/5/16

to

luser droog <luser...@gmail.com> writes:

> On Wednesday, March 30, 2016 at 2:01:32 PM UTC-5, Tim Rentsch wrote:
>> luser droog <luser...@gmail.com> writes:
>>
>>> On Saturday, March 19, 2016 at 4:44:55 PM UTC-5, Tim Rentsch wrote:
>>

> [...]

>
> I want to understand what you're saying, because the previous
> comments have been so wonderfully transformative.

Thank you for that.

> [...]

I do have some more comments, but newsgroup etiquette suggests
further conversation should be held off-group. So if you're
interested to pursue that could you send me an email? That
would be good.

luser droog

unread,

Jun 9, 2016, 12:15:51 AM6/9/16

to

On Thursday, March 17, 2016 at 3:27:38 PM UTC-5, Tim Rentsch wrote:
>
> The type name 'object' (a synonym for int) is used in some places
> to mean a value suitable for gettag(), etc. Unfortunately the name
> 'object' isn't used consistently, so sometimes we see 'object' and
> sometimes we see 'int'. I tried to straighten this out with a new
> type name, 'Datum', and making it a struct to get help from the
> compiler, but ended up going around in circles changing 'int' to
> 'Datum' in some places, only to have to change it back again
> because the values was used like a regular int! It looks to me
> like there is some fundamental confusing here, and it may be
> possible to produce an 'int' value (eg, with a multiplication) that
> later ends up being interpred as an 'object' with some weird tag
> value. Not good! I'm sorry I can't shed more light on what to do
> about that.
>

Finally added overflow checking and promotion to a new "number"
type which is a tag-union of a GMP mpz_t and an MPFR mpfr_t.
In order to spare me more headaches about type promotions and
conversions, the mpfr_t is the only floating-point type
supported. So the splay of types is partitioned-out among
the verbs.c functions and the number.c functions.
verb functions need to distinguish between the LITERAL type
which they can handle themselves or the NUMBER type which
is passed over to the number.c functions. The number functions
then have only their two types to switch on.

Mixed arithmetic always promotes to the highest type among the
arguments. Integer division always promotes.

But for addition, subtraction, and multiplication, the operation
is performed on the 24bit values cast to int64_t and the result
tested for range.

There is also now a handy auto-generated table of implemented
functions.

https://github.com/luser-dr00g/inca/blob/master/olmec/tables.md

Tim Rentsch

unread,

Jun 15, 2016, 7:51:50 PM6/15/16

to

luser droog <luser...@gmail.com> writes:

> On Thursday, March 17, 2016 at 3:27:38 PM UTC-5, Tim Rentsch wrote:
>
>> The type name 'object' (a synonym for int) is used in some places
>> to mean a value suitable for gettag(), etc. Unfortunately the name
>> 'object' isn't used consistently, so sometimes we see 'object' and
>> sometimes we see 'int'. I tried to straighten this out with a new
>> type name, 'Datum', and making it a struct to get help from the
>> compiler, but ended up going around in circles changing 'int' to
>> 'Datum' in some places, only to have to change it back again
>> because the values was used like a regular int! It looks to me
>> like there is some fundamental confusing here, and it may be
>> possible to produce an 'int' value (eg, with a multiplication) that
>> later ends up being interpred as an 'object' with some weird tag
>> value. Not good! I'm sorry I can't shed more light on what to do
>> about that.
>
> Finally added overflow checking and promotion to a new "number"
> type which is a tag-union of a GMP mpz_t and an MPFR mpfr_t.

> [...]

I expect no one else is reading this thread now, I
followed up with an email.

luser droog

unread,

Jun 16, 2016, 12:39:48 AM6/16/16

to

On Wednesday, June 15, 2016 at 6:51:50 PM UTC-5, Tim Rentsch wrote:
Ok

> > Finally added overflow checking and promotion to a new "number"
> > type which is a tag-union of a GMP mpz_t and an MPFR mpfr_t.
> > [...]
>
> I expect no one else is reading this thread now, I
> followed up with an email.

Thanks. I'll stop bumping this topic. ... uh tomorrow.

--
josh@LAPTOP-ILO10OOF ~/inca/olmec
$ ./olmec
!
((×/)&(( 1&+)&⍳))
!0
1
!1
1
!2
2
!3
6
!4
24
!24
620448401733239439360000
!100
93326215443944152681699238856266700490715968264381621468592963895217599993229915608941463976156518286253697920827223758251185210916864000000000000000000000000