/* * halign - program to produce lined-up columns * * halign reads input in the form of columns of data separated by * whitespace; the output is similar except that all the columns line * up. This was inspired by the common uses of the \halign command in * TeX; knowing how TeX's \halign command works will make this program * almost self-evident. * * Usage is * * halign [-b] [-Bstr] [-C] [-.] [-r] [-s] [-t] [-Fc] [-jRE] [-xRE] * [ [c[lr]lrx]{C,.,px,Pxy}[n] | -m | -o | -O ] ... * * where each argument (except the flags) corresponds to one column in * the input. * * Quick summary of options: * -b leading blanks are part of first field * -Bstr first field extends up through str * -C apply C flag to each field * -. apply . flag to each field * -r rest-of-line style last field * -s don't strip trailing spaces * -t same as -F * -Fc field separator is c * -F field separator is whitespace * -jRE process lines matching RE ("just"), skip others * -xRE process lines not matching RE ("except"), skip others * c[lr], l, r, x center[left/right], left, right, omit * {C,.,px,Pxy} optional flags for fields: * C C-style quoting allowed in this field * . Align on decimal point, not field edges * px This field pads with x rather than spaces * Pxy This field pads with x and y rather than spaces, * x on left, y on right. * [n] minimum column width * -m mark position for repeating argument pattern * -o specify output column separator string * -O specify default output column separator string * * The letter indicates how the column is to be justified; r indicates * flush-right, l indicates flush-left, and c indicates centered. Of * course, centering cannot be perfect without half-character * resolution; if an item cannot be centered exactly the extra space * will go on the right by default. 
If desired, this can be * controlled by giving l or r as well as c (ie, cl or cr - not lc or * rc); items that cannot be centered exactly will be placed to that * side of their precise positioning (c is thus equivalent to cl). If * x is specified, the column will be omitted from the output * entirely. * * The field flags affect only the field they are applied to. Flags * can be applied to all fields by giving them as command-line * options; when this is done, the meaning of the flag character in * the field is reversed, to allow field-specific disabling of * globally-applied flags. * * Flag Meaning * * C C-style quoting is accepted in this field. Double and * single quotes "..." and '...' quote everything inside * them; in particular, field separators will not be * recognized inside quotes. This option also understands * \ escapes, \" and \' in particular. In accord with C, * even a newline can appear provided it is backslashed, * though this will throw off the aligning of the columns. * * . This field is to be treated as numbers and aligned * correspondingly. The field contents are examined, * looking for a . character; the first one found is used * as the alignment point (the dots will appear above * one another). If no dot is found, the field is aligned * as if it had a dot appended to it, but this "virtual" * dot does not actually appear in the output. The * placement character (l/r/c) is ignored unless a field * width is given that is wider than the column that * results, in which case the placement character controls * the placement of the lined-up numbers within the field. * The fields are not checked for conformance to any * format; the only thing that is looked for is the . * character. (There is no way to line up on an arbitrary * character instead of a dot.) * * px When padding this field to its output width, use x * instead of spaces for padding. x may be any single * character (except a NUL).
* * Pxy When padding this field to its output width, use x or y * instead of spaces for padding; x is used on the left of * the field contents, y on the right. (For fields * aligned left or right, one of the characters will never * be used.) * * The number n, if given, indicates a minimum width for the column. * The default is to make the column just wide enough to contain the * widest entry; if this would result in a column narrower than n * characters, the column will be n characters wide anyway. * * Normally, input columns are separated by any amount of whitespace. * The -Fc option can be used to change this to any character c; doing * this will permit empty columns. -t is an abbreviation for the * common case of a tab as a separator. If -F is given without any * character following, it restores the default behavior (any amount * of whitespace is a separator). * * When the input field delimiter is the default (ie, whitespace), * leading whitespace on each input line is stripped. The -b option * preserves this whitespace, making it part of the first field. -B * can be used to include everything up to and including a given * string as (part of) the first field. * * The -r option tells halign not to discard extra fields; instead, * once halign starts reading the last field it will ignore field * delimiters until the newline at the end of the line. That is, -r * is used when the last field is not delimited normally but is * "everything from here to the end of the line". If this option is * not present, extra fields are silently ignored in the input (but * note that if -m is given, there are no "extra fields"). * * The -m option is used to provide a repeating pattern of arguments * without having to specify them all. The list of specifications * from the -m to the end is remembered, and if there are more input * columns than specifications, the argument list is used over * starting from the spec following the -m. 
-m must be followed by at * least one column specification; the current implementation does not * check this condition unless and until some input line contains more * columns than there are specifications, but this should not be * relied upon. * * -j and -x specify regexes that control whether lines are processed * or just copied unchanged. The default if none match a line is the * complement of the action specified by the last such option. An * empty regex never matches, and can be used to change the default * without affecting what lines are processed. * * When both -m and -r are given, -r takes precedence. * * halign puts one space between columns on output by default. If the * -o or -O options are used, this is changed. The -o option * specifies the string that is printed between the two columns it * appears between; -o options before the first field spec or after * the last are silently ignored (except as noted below). -O changes * the default separator string; it not only changes the separator as * -o does, but also changes the separator used for all later breaks * that don't have a separator specified with -o. (When -m is used * with -O, separators are generated based on attaching separators to * the fields that precede them as the argument list is scanned; * columns duplicated by the action of -m keep their original * following separators. This is the only case where specifying -o * after the last field can be useful.) * * If the last line of input is missing its trailing newline, one will * be silently supplied. * * halign normally strips all trailing blanks from each line on output, * regardless of field boundaries. If the -s option is given, this * stripping is suppressed; all lines will come out the same width. * Only spaces are stripped, regardless of what pad characters may * have been specified for the fields. * * All adjusting is done with spaces, except in fields that specify * other pad characters with the p or P field flags. * * Each character is assumed to occupy one column on output, even tabs, * newlines, backspaces, etc.
* * Examples: * * % cat x * root 0 10 / /bin/csh * mailer 0 10 / /bin/csh * daemon 1 666 / * uucp 66 1 /usr/spool/uucppublic /usr/lib/uucp/uucico * wnj 8 10 /u1/guest/wnj /bin/csh * mckusick 9 10 /u1/guest/mckusick /bin/csh * % halign -m l < x * root 0 10 / /bin/csh * mailer 0 10 / /bin/csh * daemon 1 666 / * uucp 66 1 /usr/spool/uucppublic /usr/lib/uucp/uucico * wnj 8 10 /u1/guest/wnj /bin/csh * mckusick 9 10 /u1/guest/mckusick /bin/csh * % halign l r5 r5 c c < x * root 0 10 / /bin/csh * mailer 0 10 / /bin/csh * daemon 1 666 / * uucp 66 1 /usr/spool/uucppublic /usr/lib/uucp/uucico * wnj 8 10 /u1/guest/wnj /bin/csh * mckusick 9 10 /u1/guest/mckusick /bin/csh * * halign must of course read to the end of its input before generating * any output, so that it knows how wide the columns must be. The * input column entries are copied into temporary files (in /tmp). * * The current incarnation cannot deal with more columns than it has * file descriptors available; the numeric value of this limit varies * from system to system and is, typically, a little under 20, 32, or * 64. Columns with an x key character do not count against this * limit. * * Idea credit for the -t option, the stripping of trailing blanks, and * the treatment of extra columns on input goes to Dave Martindale (at * the time, onfcanim!dave; I don't know where he is now). He also * wrote the initial version of the code to avoid copying the fields * again on output. * * This program is in the public domain. Anyone may use it in any way * for any purpose. Of course, it's also up to you to determine * whether what it does is suitable for you; the above comments may * help, but I can't promise they're accurate. It's free, and you get * what you pay for. * * If you find any bugs I would appreciate hearing about them, * especially if you also fix them. 
* * der Mouse * * mouse@rodents.montreal.qc.ca */ #include #include #include #include #include #include #include #include /* NOTE(review): the header names of the eight #include directives above are missing from this copy of the file. */ extern const char *__progname; typedef struct spec SPEC; typedef struct rxlist RXLIST; /* One -j/-x regular expression: link chains the list, re is the pattern text, comp its compiled form, and action the JX_ value recorded for it. */ struct rxlist { RXLIST *link; const char *re; regex_t comp; int action; /* JX_ values are used not only for action values but also as indicators in the control file, so they must fit in a char. */ #define JX_PROCESS 1 #define JX_SKIP 2 } ; /* Per-column state. spec is a SPEC_ justification code and flags a set of FLAG_ bits; dotoff is the offset of the alignment dot in the field being read; tfd and tf are the column's temporary file (tfd is -1 and tf is null for SPEC_X columns); buf supplies the field text written during output; minwidth is taken from the numeric suffix of the column argument; maxwidth, maxdot and maxafter record the widest field, the widest part before the dot and the widest part from the dot onward; pad_l and pad_r are the left and right pad characters; sep is the separator string written between this column and the next. */ struct spec { char spec; #define SPEC_X 0 #define SPEC_L 1 #define SPEC_R 2 #define SPEC_CL 3 #define SPEC_CR 4 char flags; #define FLAG_C 0x01 #define FLAG_DOT 0x02 #define FLAG_SAWDOT 0x04 int dotoff; int tfd; FILE *tf; char *buf; int minwidth; int maxwidth; int maxdot; int maxafter; char pad_l; char pad_r; const char *sep; } ; /* errs counts command-line errors, ncols the column specs, and nlines the input lines counted by endline(); colno and colwidth are the column currently being filled and the number of characters put into it so far. */ static int errs; static int ncols; static int nlines; static const char *defsep; static SPEC *spec; static char tfilename[80]; static char sepchar = '\0'; static int colno; static int colwidth; static int debugging = 0; static int outspaces; /* Command-line option state; mflag is the index of the first column spec following -m, or -1 if -m was not given. */ static int bflag = 0; static char *Bflag = 0; static int rflag = 0; static int sflag = 0; static int mflag = -1; static int Cflag = 0; static int dotflag = 0; static RXLIST *rxlist = 0; static RXLIST **rxtail = &rxlist; static int defact = JX_PROCESS; /* Scanner state used while reading a line. */ static char *Bptr; static int quote; static int backslashed; static int inleadin; /* Control file and line buffer used by geti() when -j/-x regexes are present: iline holds the current line, ila is its allocated size, ill the number of bytes stored, ilx the next index to hand out, and ile is set once end of input has been seen. */ static FILE *ctl; static unsigned char *iline; static int ile; static int ila; static int ill; static int ilx; /* Create the temporary file named by tfilename, unlink it at once, and return its descriptor. If the name already exists (EEXIST) the existing file is unlinked and the open is retried. Prints a message and returns -1 if the open fails for any other reason or if the unlink fails. */ static int open_tmp(void) { int fd; while (1) { fd = open(&tfilename[0],O_RDWR|O_CREAT|O_TRUNC|O_EXCL,0644); if ((fd < 0) && (errno != EEXIST)) { fprintf(stderr,"%s: can't open temporary file %s: %s\n",__progname,&tfilename[0],strerror(errno)); return(-1); } if (unlink(&tfilename[0]) < 0) { fprintf(stderr,"%s: can't remove temporary file %s: %s\n",__progname,&tfilename[0],strerror(errno)); return(-1); } if (fd >= 0) return(fd); } } /* Give column s a temporary file descriptor in s->tfd, or -1 for an omitted (SPEC_X) column. Returns nonzero on failure. NOTE(review): the SPEC_X test reads spec[ncols] rather than s; the two callers visible in this file both pass &spec[ncols]. */ static int opentfile(SPEC *s) { if (spec[ncols].spec == SPEC_X) { s->tfd = -1; return(0); }
s->tfd = open_tmp(); return(s->tfd<0); } /* Wrap s->tfd in a read/write stdio stream stored in s->tf; s->tf is null when the column has no descriptor. Returns 1 after printing a message if fdopen fails, otherwise 0. */ static int fopentfile(SPEC *s) { if (s->tfd >= 0) { s->tf = fdopen(s->tfd,"r+"); if (! s->tf) { fprintf(stderr,"%s: cannot fdopen: %s\n",__progname,strerror(errno)); return(1); } } else { s->tf = 0; } return(0); } /* Write one output character. Unless -s was given, spaces are only counted in outspaces and are written out later, just before the next non-space character; main() zeroes outspaces before writing each newline, which is how trailing spaces get dropped. */ static void charout(char c) { if (! sflag) { if (c == ' ') { outspaces ++; return; } for (;outspaces>0;outspaces--) { putchar(' '); } } putchar(c); } /* Extend the spec array for -m by nmore = ncols - mflag entries. Each new entry starts as a copy of spec[mflag] with its width statistics and FLAG_SAWDOT cleared, and gets its own temporary file. Exits if no spec follows -m, or if memory or a temporary file cannot be obtained. NOTE(review): the text from the inner `for (i=0;i` onward is damaged in this copy of the file; the end of this function (where ncols and mflag are presumably advanced) and the start of the following one (presumably endcolumn(), which endline() calls) are missing. */ static void growcolumns(void) { int nmore; int i; nmore = ncols - mflag; if (debugging >= 1) { printf("growcolumns: ncols = %d, mflag = %d, nmore = %d, nlines = %d\n", ncols,mflag,nmore,nlines); } if (nmore <= 0) { fprintf(stderr,"%s: -m must be followed by a column spec\n",__progname); exit(1); } spec = (SPEC *) realloc((char *)spec,(ncols+nmore)*sizeof(SPEC)); if (spec == 0) { fprintf(stderr,"Out of memory, sorry\n"); exit(1); } for (;nmore>0;nmore--) { spec[ncols] = spec[mflag]; spec[ncols].maxwidth = 0; spec[ncols].maxdot = 0; spec[ncols].maxafter = 0; spec[ncols].flags &= ~FLAG_SAWDOT; if (opentfile(&spec[ncols]) || fopentfile(&spec[ncols])) exit(1); if (spec[ncols].tf) { for (i=0;i= ncols) { if (mflag >= 0) { growcolumns(); } else { return; } } if (spec[colno].spec != SPEC_X) { if (spec[colno].flags & FLAG_DOT) { if (!
(spec[colno].flags & FLAG_SAWDOT)) spec[colno].dotoff = colwidth; if (spec[colno].dotoff > spec[colno].maxdot) spec[colno].maxdot = spec[colno].dotoff; if (colwidth-spec[colno].dotoff > spec[colno].maxafter) spec[colno].maxafter = colwidth - spec[colno].dotoff; if (debugging >= 1) { fprintf(stderr,"col %d end, maxdot now %d, maxafter now %d\n",colno,spec[colno].maxdot,spec[colno].maxafter); } } else { if (colwidth > spec[colno].maxwidth) spec[colno].maxwidth = colwidth; if (debugging >= 1) { fprintf(stderr,"col %d end, maxwidth now %d\n",colno,spec[colno].maxwidth); } } /* A newline ends this field's record in the column's temporary file. */ putc('\n',spec[colno].tf); } colno ++; colwidth = 0; } /* Reset the start-of-line scanner state: mark that we are in the lead-in and point Bptr back at the start of the -B string. */ static void begin_new_line(void) { inleadin = 1; Bptr = Bflag; } /* Finish the current input line: end every remaining column, count the line in nlines, and reset the column position. NOTE(review): the text after `for (i=0;i` is damaged in this copy of the file; the end of this function and the opening of the next one (presumably colchar(), which main() calls) are missing. */ static void endline(void) { int i; while (colno < ncols) endcolumn(); if (debugging >= 1) fprintf(stderr,"line %d end\n",nlines); nlines ++; colno = 0; colwidth = 0; for (i=0;i= 2) fprintf(stderr,"char `%c' in col %d: ",c,colno); if (colno >= ncols) { if (mflag >= 0) { growcolumns(); } else { if (debugging >= 2) fprintf(stderr,"no col\n"); return; } } /* The first '.' seen in a dot-aligned field fixes its alignment offset. */ if ((c == '.') && (spec[colno].flags & FLAG_DOT) && !(spec[colno].flags & FLAG_SAWDOT)) { spec[colno].flags |= FLAG_SAWDOT; spec[colno].dotoff = colwidth; } colwidth ++; if (spec[colno].spec == SPEC_X) { if (debugging >= 2) fprintf(stderr,"x, width now %d\n",colwidth); return; } putc(c,spec[colno].tf); if (debugging >= 2) fprintf(stderr,"width now %d\n",colwidth); } /* Record a -j/-x option. A non-empty re is compiled as an extended regex and appended to the list with action act; a compile failure is reported and counted in errs (the entry is still linked in). In every case, including an empty re, the default action becomes the opposite of act. NOTE(review): the malloc() result is not checked. */ static void add_regex(const char *re, int act) { if (*re) { RXLIST *r; int e; r = malloc(sizeof(RXLIST)); r->re = re; e = regcomp(&r->comp,re,REG_EXTENDED|REG_NOSUB); if (e) { int l; l = regerror(e,&r->comp,0,0); { char ebuf[l]; regerror(e,&r->comp,&ebuf[0],l); fprintf(stderr,"%s: invalid RE %s: %s\n",__progname,re,&ebuf[0]); } errs ++; } r->action = act; r->link = 0; *rxtail = r; rxtail = &r->link; } defact = JX_PROCESS + JX_SKIP - act; } /* Return the next input character, or EOF. With no -j/-x regexes this is just getchar(). Otherwise whole lines are gathered in iline and classified by action_for_line(): a line to be skipped is copied into the control file (a JX_SKIP byte, the int ill, then ill bytes) and never returned; a line to be processed is recorded as a JX_PROCESS byte and its characters are handed out one per call. Uses a nested function and a __label__ local label, which are GNU C extensions. NOTE(review): at end of input ile is set before action_for_line() runs, so when an unterminated final line is to be processed only its first character appears to be returned before later calls hit the ile check - confirm. NOTE(review): the realloc() result is not checked. */ static int geti(void) { __label__ retc; int c; static void action_for_line(int len) { RXLIST *r; int a; /* The first regex that matches decides; otherwise the default action applies. */ a = defact; for
(r=rxlist;r;r=r->link) { int e; regmatch_t m; m.rm_so = 0; m.rm_eo = len; e = regexec(&r->comp,iline,0,&m,REG_STARTEND); if (e == 0) { a = r->action; break; } else if (e != REG_NOMATCH) { int l; l = regerror(e,&r->comp,0,0); { char ebuf[l]; regerror(e,&r->comp,&ebuf[0],l); fprintf(stderr,"%s: regexec() error for %s: %s\n",__progname,r->re,&ebuf[0]); } exit(1); } } /* For a processed line, jump out of the nested function to retc with the line's first character; a skipped line is written to the control file and control returns to the caller's read loop. */ switch (a) { case JX_PROCESS: putc(JX_PROCESS,ctl); ilx = 1; c = iline[0]; goto retc; break; case JX_SKIP: putc(JX_SKIP,ctl); fwrite(&ill,sizeof(int),1,ctl); fwrite(iline,1,ill,ctl); break; default: abort(); break; } } if (! rxlist) return(getchar()); if (ile) return(EOF); if (iline && (ilx < ill)) return(iline[ilx++]); ill = 0; while (1) { c = getchar(); if (c == EOF) { ile = 1; if (ill > 0) action_for_line(ill); return(EOF); } if (ill >= ila) iline = realloc(iline,ila=ill+32); iline[ill++] = c; if (c == '\n') { action_for_line(ill-1); ill = 0; } } retc:; return(c); } /* Entry point. Parses options and column specs from the command line, reads the input (presumably through geti(); the call itself is in a damaged span) while colchar() and endline() route field text into the per-column temporary files, then writes the aligned lines with charout(). NOTE(review): in this copy of the file three spans of the body are damaged (after the first `for (i=0;i`, after `for (i=1;i`, and after the second `for (i=0;i`), so the code that sets up the read loop, handles field separators and starts the output pass is not visible. NOTE(review): the malloc() result below is not checked. */ int main(int, char **); int main(int ac, char **av) { int i; int j; int k; int l; int any; char *cp; const char *ccp; int c; errs = 0; ncols = 0; defsep = " "; spec = (SPEC *) malloc(ac*sizeof(SPEC)); /* a maximum (unless -m) */ sprintf(tfilename,"/tmp/halign.%d",getpid()); for (ac--,av++;ac;ac--,av++) { if (**av == '-') { for (++*av;**av;++*av) { switch (**av) { default: fprintf(stderr,"%s: bad flag -%c\n",__progname,**av); errs ++; break; case 'b': bflag ++; break; case 'B': Bflag = &av[0][1]; goto eatopt; /* break from for loop */ break; case 'c': /* (undocumented) backward compatibility */ Cflag ++; break; case 'C': Cflag ++; break; case 'D': debugging ++; break; case 'F': sepchar = av[0][1]; if (sepchar != '\0') ++*av; break; case 'j': add_regex(&av[0][1],JX_PROCESS); goto eatopt; /* break from for loop */ break; case 'm': mflag = ncols; break; case 'o': if (ncols > 0) spec[ncols-1].sep = &av[0][1]; goto eatopt; /* break from for loop */ break; case 'O': if (ncols > 0) spec[ncols-1].sep = &av[0][1]; defsep = &av[0][1]; goto eatopt; /*
break from for loop */ break; case 'r': rflag ++; break; case 's': sflag ++; break; case 't': sepchar = '\t'; break; case 'x': add_regex(&av[0][1],JX_SKIP); goto eatopt; /* break from for loop */ break; case '.': dotflag ++; break; } } eatopt:; } else { /* A column spec: start from the defaults, with the global -C and -. settings applied. */ spec[ncols].minwidth = 0; spec[ncols].maxwidth = 0; spec[ncols].maxdot = 0; spec[ncols].maxafter = 0; spec[ncols].pad_l = ' '; spec[ncols].pad_r = ' '; spec[ncols].sep = defsep; spec[ncols].flags = (Cflag ? FLAG_C : 0) | (dotflag ? FLAG_DOT : 0); switch (**av) { case 'c': switch (*++*av) { case 'l': spec[ncols].spec = SPEC_CL; break; case 'r': spec[ncols].spec = SPEC_CR; break; default: spec[ncols].spec = SPEC_CL; --*av; break; } break; case 'l': spec[ncols].spec = SPEC_L; break; case 'r': spec[ncols].spec = SPEC_R; break; case 'x': spec[ncols].spec = SPEC_X; break; default: fprintf(stderr, "%s: `%c' is not a valid specification (c, l, r, or x)\n", __progname,**av); errs ++; continue; break; } ++*av; /* Field flags run up to the first digit; C and . toggle the globally applied setting for this column. */ while (**av && !isdigit(**av)) { switch (**av) { case 'C': spec[ncols].flags ^= FLAG_C; break; case '.': spec[ncols].flags ^= FLAG_DOT; break; case 'p': if (av[0][1]) { ++*av; spec[ncols].pad_l = **av; spec[ncols].pad_r = **av; } else { fprintf(stderr,"%s: no character for p field flag\n",__progname); errs ++; } break; case 'P': if (av[0][1] && av[0][2]) { spec[ncols].pad_l = *++*av; spec[ncols].pad_r = *++*av; } else { fprintf(stderr,"%s: missing character(s) for P field flag\n",__progname); errs ++; } break; default: fprintf(stderr,"%s: bad field flag `%c'\n",__progname,**av); errs ++; break; } ++*av; } if (**av) spec[ncols].minwidth = atoi(*av); if (opentfile(&spec[ncols])) errs ++; ncols ++; } } if (errs || (ncols <= 0)) { fprintf(stderr,"\ Usage: %s [-b] [-Bstr] [-C] [-.] [-r] [-s] [-t] [-Fc] [-[jx]RE] [ [c[lr]lrx]{C,.,px,Pxy}[n] | -m ] ...\n", __progname); exit(1); } /* Regexes were given, so open the control file that geti() records line actions in. */ if (rxlist) { int t; t = open_tmp(); if (t < 0) exit(1); ctl = fdopen(t,"r+"); if (!
ctl) { fprintf(stderr,"%s: cannot fdopen: %s\n",__progname,strerror(errno)); exit(1); } } for (i=0;i 0) || (colwidth > 0)) { c = '\n'; backslashed = 0; /* are these the Right Thing? */ quote = 0; } else { break; } } /* C-style quoting for this column: the character after a backslash inside quotes, and everything else inside quotes up to the closing quote or a newline, is added to the field as-is. */ if (spec[colno].flags & FLAG_C) { if (backslashed) { backslashed = 0; colchar(c); continue; } if (quote) { if ((c == quote) && !backslashed) { quote = 0; } else if (c == '\\') { backslashed = 1; } else if (c == '\n') { endline(); continue; } colchar(c); continue; } else { if ((c == '"') || (c == '\'')) { quote = c; colchar(c); continue; } } } if (c == '\n') { endline(); continue; } /* -r: in the last column (once it has started, or always when an explicit separator character is set) characters are added without looking for separators. */ if ( rflag && (colno >= ncols-1) && (colwidth || sepchar) ) { colchar(c); continue; } if (Bptr) { colchar(c); if ((char)c == *Bptr) { Bptr ++; if (! *Bptr) { Bptr = 0; inleadin = 0; } } else { int i; int n; n = Bptr - Bflag; for (i=1;i0;l--) { c = getc(ctl); if (c == EOF) { fprintf(stderr,"%s: unexpected EOF in control file\n",__progname); exit(1); } putchar(c); } } continue; break; case JX_PROCESS: if (nlines < 1) { fprintf(stderr,"%s: control file too long\n",__progname); exit(1); } break; default: fprintf(stderr,"%s: unexpected control byte %02x in control file\n",__progname,c); exit(1); } nlines --; any = 0; for (i=0;i 0) && (cp[j-1] == '\n')) cp[--j] = '\0'; /* l starts as the padding needed to reach maxwidth; k becomes the part written on the left and l is reduced to the part written on the right. */ l = spec[i].maxwidth - j; if (spec[i].flags & FLAG_DOT) { char *dp; dp = index(cp,'.'); if (dp == 0) dp = cp + strlen(cp); k = spec[i].maxdot - (dp - cp); } else { switch (spec[i].spec) { case SPEC_L: k = 0; break; case SPEC_CL: k = l / 2; break; case SPEC_CR: k = (l + 1) / 2; break; case SPEC_R: k = l; break; } } l -= k; if (any) for (ccp=spec[i-1].sep;*ccp;ccp++) charout(*ccp); for (;k>0;k--) charout(spec[i].pad_l); for (cp=spec[i].buf;*cp;cp++) charout(*cp); for (;l>0;l--) charout(spec[i].pad_r); any ++; } } /* Drop any pending trailing spaces before ending the output line. */ outspaces = 0; charout('\n'); } out:; exit(0); }