#include #include #include #include #include #include "cpu_time.c" /***** default values of parameters ******************************************/ #define LMAX 80 /* max number of characters in a line */ #define LMIN 70 /* min number of characters in a line */ #define IND1 0 /* indent (#scp) of the 1st paragraph */ #define IND2 0 /* indent (#scp) of other paragraph(s) */ #define LINES 1 /* #empty lines between paragraphs */ #define CTYPE 2 /* cost type (1: simple, 2: migi zoroe) */ #define EXP 5 /* exponent of the cost */ #define PSLIDE 1 /* slide '.' and ',' to the right or not (0/1) (available only if ctype=2) */ #define OUTLVL 0 /* output level (0: no extra, 1: with cost etc., 2: cost of every text width) */ typedef struct { char *ostr; /* original character string */ int tnwords; /* total number of words */ int maxwl; /* max length of words */ int np; /* number of paragraphs */ int *nwords; /* number of words for each paragraph */ char ***whead; /* head of a word */ char ***wtail; /* tail of a word */ int **nchars; /* number of characters in a word */ }TXTdata; /* data of the given text */ typedef struct { double cost; /* original character string */ int txtw; /* text width */ int **spc; /* space after a word */ }SOLdata; /* solution data */ typedef struct { int lmin; /* minimum number of characters in a line */ int lmax; /* minimum number of characters in a line */ int ind1; /* indent (#scp) of the 1st paragraph */ int ind2; /* indent (#scp) of other paragraph(s) */ int lines; /* #empty lines between paragraphs */ int ctype; /* cost type (1: simple, 2: migi zoroe) */ int outlvl; /* output level */ int pslide; /* slide '.' and ',' or not (0/1) */ double exp; /* exponent of the cost */ }Param; /* solution data */ /*************************** functions ***************************************/ void *malloc_e(size_t size); void *realloc_e(void *s, size_t size); void read_data(TXTdata *txtdata); void modify_data(TXTdata *txtdata, Param *param); void free_mems_main(TXTdata *txtdata, SOLdata *sol); void prepare_memory_for_sol(TXTdata *txtdata, SOLdata *sol); void copy_parameters(int argc, char *arcv[], Param *param); void output_layout(TXTdata *txtdata, SOLdata *bestsol, Param *param, double starttime); /***** local functions for typeset *******************************************/ void solve_typeset(TXTdata *txtdata, SOLdata *bestsol, Param *param); void solve_typeset_fixedw(TXTdata *txtdata, SOLdata *sol, int txtwidth, Param *param); void update_bestsol(SOLdata *sol, SOLdata *bestsol); void free_mems_typeset(SOLdata *sol); double cost(TXTdata *txtdata, int np, int i, int j, int txtwidth, Param *param, int clength); void layout(TXTdata *txtdata, int np, int i, int j, int txtwidth, Param *param, SOLdata *sol); /***** copy and read the parameters ******************************************/ void copy_parameters(int argc, char *argv[], Param *param) { int i; /**** copy the parameters ****/ param->lmax = LMAX; param->lmin = LMIN; param->ind1 = IND1; param->ind2 = IND2; param->lines = LINES; param->ctype = CTYPE; param->outlvl = OUTLVL; param->pslide = PSLIDE; param->exp = EXP; /**** read the parameters ****/ if(argc>0 && (argc % 2)==0){ printf("USAGE: ./typeset [param_name, param_value] [name, value]...\n"); printf(" e.g.: gunzip -c data/article1.gz | ./typeset lmin 60 outlvl 1\n"); printf(" Options (default values are shown in []).\n"); printf(" lmax [%d] maximum number of characters in a line\n", LMAX); printf(" lmin [%d] minimum number of characters in a line\n", LMIN); printf(" ind1 [%d] indent (#spaces) of the 1st paragraph\n", IND1); printf(" ind2 [%d] indent of the other paragraph(s)\n", IND2); printf(" lines [%d] the number of empty lines between paragraphs\n", LINES); printf(" ctype [%d] cost type (1: simple, 2: right justified)\n", CTYPE); printf(" outlvl [%d] output level (0: results only, 1: cost etc.,\n", OUTLVL); printf(" 2: intermediate results)\n"); printf(" pslide [%d] slide '.' and ',' outside the right line or not (0/1)\n", PSLIDE); printf(" exp [%d] exponent of the cost\n", EXP); exit(EXIT_FAILURE);} else{ for(i=1; ilmax = atoi(argv[i+1]); if(strcmp(argv[i],"lmin")==0) param->lmin = atoi(argv[i+1]); if(strcmp(argv[i],"ind1")==0) param->ind1 = atoi(argv[i+1]); if(strcmp(argv[i],"ind2")==0) param->ind2 = atoi(argv[i+1]); if(strcmp(argv[i],"lines")==0) param->lines = atoi(argv[i+1]); if(strcmp(argv[i],"ctype")==0) param->ctype = atoi(argv[i+1]); if(strcmp(argv[i],"outlvl")==0) param->outlvl = atoi(argv[i+1]); if(strcmp(argv[i],"pslide")==0) param->pslide = atoi(argv[i+1]); if(strcmp(argv[i],"exp")==0) param->exp = atof(argv[i+1]); } } } /***** malloc with error check ***********************************************/ void *malloc_e( size_t size ) { void *s; if ( (s=malloc(size)) == NULL ) { fprintf( stderr, "malloc : Not enough memory.\n" ); exit( EXIT_FAILURE ); } return s; } /***** realloc with error check **********************************************/ void *realloc_e( void *s, size_t size ) { void *snew; if ( (snew = realloc(s, size)) == NULL ) { fprintf( stderr, "realloc : Not enough memory.\n" ); exit( EXIT_FAILURE ); } return snew; } /***** read the text data from stdin *****************************************/ void read_data(TXTdata *txtdata) { int nchar; /* number of characters currently read */ int ostr_size; /* current size of "original string" */ char ch; /* current character */ FILE *fp=stdin; int i; int nw_in_line; /* number of words in a line */ int blank_prev; /* 1 -> prev char was spc/tab/return */ int blank_line_prv; /* 1 -> prev line was blank */ int np; /* number of paragraphs */ int tnwords; /* total number of words */ int nwords; /* number of words in a paragraph */ int nchars; /* number of characters in a word */ /**** store the given text into "txtdata->ostr" ****/ ostr_size = 8; txtdata->ostr = malloc_e(ostr_size * sizeof(char)); for(nchar = 0;;) { ch = fgetc(fp); if(ch == EOF){ break; } if(nchar >= ostr_size) { ostr_size *= 2; txtdata->ostr = realloc_e(txtdata->ostr, ostr_size * sizeof(char)); /*printf(" ostr: [%d %d]\n", (int) txtdata->ostr, (int) (txtdata->ostr + ostr_size -1));*/ } txtdata->ostr[nchar] = ch; nchar++; } if(nchar+2 > ostr_size){ ostr_size = nchar+2; txtdata->ostr = realloc_e(txtdata->ostr, ostr_size * sizeof(char)); } txtdata->ostr[nchar] = txtdata->ostr[nchar+1] = '\n'; nchar += 2; /*for(i=0; iostr[i]); }*/ /**** count the #paragraphs & #words ****/ nw_in_line = 0; blank_prev = 1; blank_line_prv = 1; np = tnwords = 0; for(i=0; iostr[i]; if(ch == '\n') { if(nw_in_line > 0) { if(blank_line_prv == 1) { np++; } blank_line_prv = 0; } else {blank_line_prv = 1;} tnwords += nw_in_line; nw_in_line = 0; blank_prev = 1; } else if(ch == ' ' || ch == '\t') { blank_prev = 1; } else { if(blank_prev == 1) { nw_in_line++; } blank_prev = 0; } } /*printf("#paragraphs: %d, #all words: %d\n", np, tnwords);*/ /**** get memory space for paragraphs and words ****/ txtdata->nwords = (int *) malloc_e(np * sizeof(int)); txtdata->whead = (char ***) malloc_e(np * sizeof(char **)); txtdata->wtail = (char ***) malloc_e(np * sizeof(char **)); txtdata->nchars = (int **) malloc_e(np * sizeof(int *)); txtdata->whead[0] = (char **) malloc_e(tnwords * sizeof(char *)); txtdata->wtail[0] = (char **) malloc_e(tnwords * sizeof(char *)); txtdata->nchars[0] = (int *) malloc_e(tnwords * sizeof(int)); /* printf(" nwords: [%d %d]\n", (int) txtdata->nwords, (int) (txtdata->nwords + np-1)); printf(" whead: [%d %d]\n", (int) txtdata->whead, (int) (txtdata->whead + np-1)); printf(" wtail: [%d %d]\n", (int) txtdata->wtail, (int) (txtdata->wtail + np-1)); printf(" nchars: [%d %d]\n", (int) txtdata->nchars, (int) (txtdata->nchars + np-1)); printf(" *whead: [%d %d]\n", (int) txtdata->whead[0], (int) (txtdata->whead[0] + tnwords-1)); printf(" *wtail: [%d %d]\n", (int) txtdata->wtail[0], (int) (txtdata->wtail[0] + tnwords-1)); printf("*nchars: [%d %d]\n", (int) txtdata->nchars[0], (int) (txtdata->nchars[0] + tnwords-1));*/ /**** store data in "txtdata" ****/ txtdata->np = np; txtdata->tnwords = tnwords; nw_in_line = 0; blank_prev = 1; blank_line_prv = 1; np = nwords = nchars = 0; for(i=0; iostr[i]; if(ch == ' ' || ch == '\t' || ch == '\n') { if(blank_prev == 0) { /* a word ends */ txtdata->wtail[np][nwords] = &(txtdata->ostr[i-1]); txtdata->nchars[np][nwords] = nchars; nw_in_line++; nwords++; } blank_prev = 1; } else { if(blank_prev == 1) { /* a word begins */ txtdata->whead[np][nwords] = &(txtdata->ostr[i]); nchars = 1; } else { nchars++; } blank_prev = 0; } if(ch == '\n') { if(nw_in_line > 0) { blank_line_prv = 0; } else { if(blank_line_prv == 0) { /* a paragraph ends */ txtdata->nwords[np] = nwords; if(np+1 < txtdata->np) { txtdata->whead[np+1] = txtdata->whead[np] + nwords; txtdata->wtail[np+1] = txtdata->wtail[np] + nwords; txtdata->nchars[np+1] = txtdata->nchars[np] + nwords; np++; } nwords = 0; } blank_line_prv = 1; } nw_in_line = 0; } } /**** print out for debug ****/ /*for(i=0; iostr[i]); }*/ /*printf("#all words: %d\n", txtdata->tnwords); for(np = 0; np < txtdata->np; np++) { printf("#parargraph %d, #words: %d\n", np+1, txtdata->nwords[np]); for(nwords = 0; nwords < txtdata->nwords[np]; nwords++) { for(nchars = 0; nchars < txtdata->nchars[np][nwords]; nchars++) { printf("%c!", txtdata->whead[np][nwords][nchars]); } if( (nwords+1) % 10 == 0 ) { printf("\n"); } else { printf(" "); } } printf("\n"); }*/ } /***** prepare memory for a solution *****************************************/ void prepare_memory_for_sol(TXTdata *txtdata, SOLdata *sol) { int np, nwords; /**** prepare memory space ****/ sol->spc = (int **) malloc_e(txtdata->np * sizeof(int *)); sol->spc[0] = (int *) malloc_e(txtdata->tnwords * sizeof(int)); for(np = 1; np < txtdata->np; np++) { sol->spc[np] = sol->spc[np-1] + txtdata->nwords[np-1]; } /**** initialize cost ****/ sol->cost = INT_MAX; /**** initialize sol just in case ****/ for(np = 0; np < txtdata->np; np++) { for(nwords = 0; nwords < txtdata->nwords[np]; nwords++) { sol->spc[np][nwords] = 1; if((nwords+1)%10 == 0 || nwords == txtdata->nwords[np] - 1) { sol->spc[np][nwords] = -1; } } } } /***** modify the 1st word of each paragraph adding indent *******************/ /***** then comput the max length of a word, and modify lmin and/or lmax *****/ void modify_data(TXTdata *txtdata, Param *param) { int indent; /* indent */ char *word; /* word */ int np, nwords, i, nchars, maxwl, wl; /***** modify the 1st word of each paragraph adding indent ****/ for(np = 0; np < txtdata->np; np++) { if(np == 0) { indent = param->ind1; } else { indent = param->ind2; } nchars = txtdata->nchars[np][0]; word = (char *) malloc_e((indent + nchars) * sizeof(char)); for(i = 0; i < indent; i++) { word[i] = ' '; } for(i = 0; i < nchars; i++) { word[i+indent] = txtdata->whead[np][0][i]; } nchars += indent; txtdata->whead[np][0] = word; txtdata->wtail[np][0] = word + nchars; txtdata->nchars[np][0] = nchars; } /**** compute max length of a word ****/ maxwl = 0; for(np = 0; np < txtdata->np; np++) { for(nwords = 0; nwords < txtdata->nwords[np]; nwords++) { wl = txtdata->nchars[np][nwords]; if(wl > maxwl) { maxwl = wl; } } } /**** modify lmin and/or lmax if necessary ****/ if(param->lmin < maxwl) { param->lmin = maxwl; } if(param->lmax < param->lmin) { param->lmax = param->lmin; } } /***** output the best layout ************************************************/ void output_layout(TXTdata *txtdata, SOLdata *bestsol, Param *param, double starttime) { int np, nwords, i; if(param->outlvl >= 2) { printf("\n"); } for(np = 0; np < txtdata->np; np++) { if(np > 0) { for(i = 0; i < param->lines; i++) { printf("\n"); } } for(nwords = 0; nwords < txtdata->nwords[np]; nwords++) { for(i = 0; i < txtdata->nchars[np][nwords]; i++) { /* print a word */ printf("%c", txtdata->whead[np][nwords][i]); } if(bestsol->spc[np][nwords] == -1) { printf("\n"); } else { for(i = 0; i < bestsol->spc[np][nwords]; i++) { printf(" "); } } } } if(param->outlvl >= 1) { printf("\n# text width: %d cost: %g", bestsol->txtw, bestsol->cost); printf(" time: %g seconds.\n", cpu_time() - starttime); } } /***** free memory cells used in the upper level *****************************/ void free_mems_main(TXTdata *txtdata, SOLdata *bestsol) { int np; for(np = 0; np < txtdata->np; np++) { free((void *) txtdata->whead[np][0]); } free((void *) txtdata->ostr); free((void *) txtdata->whead[0]); free((void *) txtdata->whead); free((void *) txtdata->wtail[0]); free((void *) txtdata->wtail); free((void *) txtdata->nchars[0]); free((void *) txtdata->nchars); free((void *) bestsol->spc[0]); free((void *) bestsol->spc); } /***** SOLVER BELOW **********************************************************/ /***** top routine of the solver *********************************************/ void solve_typeset(TXTdata *txtdata, SOLdata *bestsol, Param *param) { SOLdata sol; /* a solution */ int txtwidth; /* text width */ prepare_memory_for_sol(txtdata, &sol); for(txtwidth = param->lmin; txtwidth <= param->lmax; txtwidth++) { solve_typeset_fixedw(txtdata, &sol, txtwidth, param); update_bestsol(&sol, bestsol); } free_mems_typeset(&sol); } /***** solve typeset problem with a fixed text width *************************/ void solve_typeset_fixedw(TXTdata *txtdata, SOLdata *sol, int txtwidth, Param *param) { int np, max_nwords, i, j, clength, dpub; double newcost; double *dptable; int *dpbestj; int overshoot; overshoot = 1; /* parameter: in DP, j is considered up to txtwidth + overshoot */ dpub = txtwidth + overshoot; sol->cost = 0; sol->txtw = txtwidth; max_nwords = 0; for(np = 0; np < txtdata->np; np++) { if(txtdata->nwords[np] > max_nwords) { max_nwords = txtdata->nwords[np]; } } dptable = (double *) malloc_e( (max_nwords+1) * sizeof(double) ); dpbestj = (int *) malloc_e( (max_nwords+1) * sizeof(int) ); for(np = 0; np < txtdata->np; np++) { /**** DP recursion ****/ dptable[txtdata->nwords[np]] = 0; for(i = txtdata->nwords[np] - 1; i >= 0; i--) { dptable[i] = INT_MAX; clength = -1; for(j = i; j < txtdata->nwords[np]; j++) { clength += (txtdata->nchars[np][j] + 1); if(clength > dpub) { break; } newcost = dptable[j+1] + cost(txtdata, np, i, j, txtwidth, param, clength); if(newcost < dptable[i]) { dptable[i] = newcost; dpbestj[i] = j; } } } /**** store optimal solution ****/ sol->cost += dptable[0]; for(i = 0; i < txtdata->nwords[np];) { j = dpbestj[i]; layout(txtdata, np, i, j, txtwidth, param, sol); i = j + 1; } } if(param->outlvl >= 2) { printf("# text width: %d cost: %g\n", txtwidth, sol->cost); } free((void *) dptable); free((void *) dpbestj); } /***** update bestsol ********************************************************/ void update_bestsol(SOLdata *sol, SOLdata *bestsol) { int **tmpspc; if(sol->cost < bestsol->cost) { bestsol->cost = sol->cost; bestsol->txtw = sol->txtw; tmpspc = sol->spc; sol->spc = bestsol->spc; bestsol->spc = tmpspc; } } /***** cost of writing ith-jth words in a line *******************************/ double cost(TXTdata *txtdata, int np, int i, int j, int txtwidth, Param *param, int clength) { int diff, nsp, avg_excess, n_max_excess; char lastchar; double cval; diff = txtwidth - clength; switch (param->ctype) { case 1: if(diff < 0) { return INT_MAX; } else if(j == txtdata->nwords[np]-1) { return 0; } else { return pow(diff, param->exp); } break; case 2: lastchar = txtdata->wtail[np][j][0]; if(param->pslide == 1) {if(lastchar == '.' || lastchar == ',') { diff++; }} if(diff < 0) { return INT_MAX; } else if(diff == 0) { return 0; } else if(j == txtdata->nwords[np]-1) { return 0; } else { nsp = j - i; if(nsp == 0) { nsp = 1; } avg_excess = (int) diff / nsp; n_max_excess = diff - (avg_excess * nsp); cval = (nsp - n_max_excess) * pow(avg_excess, param->exp); cval += (n_max_excess * pow(avg_excess+1, param->exp)); return cval; } break; default: printf("parameter 'ctype' is invalid\n"); exit(EXIT_FAILURE); } } /***** design layout of writing ith-jth words in a line **********************/ void layout(TXTdata *txtdata, int np, int i, int j, int txtwidth, Param *param, SOLdata *sol) { int k, csum, diff, nsp, avg_excess, n_max_excess, intvl, shift; char lastchar; sol->spc[np][j] = -1; switch (param->ctype) { case 1: for(k = i; k < j; k++) { sol->spc[np][k] = 1; } break; case 2: csum = -1; for(k = i; k <= j; k++) { csum += (txtdata->nchars[np][k]+1); } diff = txtwidth - csum; lastchar = txtdata->wtail[np][j][0]; if(param->pslide == 1) {if(lastchar == '.' || lastchar == ',') { diff++; }} if(diff <= 0 || j == txtdata->nwords[np]-1) { for(k = i; k < j; k++) { sol->spc[np][k] = 1; } } else { nsp = j - i; if(nsp == 0) { nsp = 1; } avg_excess = (int) diff / nsp; for(k = i; k < j; k++) { sol->spc[np][k] = avg_excess + 1; } n_max_excess = diff - (avg_excess * nsp); if(n_max_excess > 0){ intvl = (int) nsp / n_max_excess; shift = (int) intvl / 2; for(k = i; k < j && n_max_excess > 0; k++) { if((k-i) % intvl == shift) { sol->spc[np][k]++; n_max_excess--; } } } } break; default: printf("parameter 'ctype' is invalid\n"); exit(EXIT_FAILURE); } } /***** free local memory space ***********************************************/ void free_mems_typeset(SOLdata *sol) { free((void *) sol->spc[0]); free((void *) sol->spc); } /***** main ******************************************************************/ int main(int argc, char *argv[]) { TXTdata txtdata; /* text data */ SOLdata bestsol; /* the best solution */ Param param; /* parameters */ double starttime; copy_parameters(argc, argv, ¶m); read_data(&txtdata); starttime = cpu_time(); modify_data(&txtdata, ¶m); prepare_memory_for_sol(&txtdata, &bestsol); solve_typeset(&txtdata, &bestsol, ¶m); output_layout(&txtdata, &bestsol, ¶m, starttime); free_mems_main(&txtdata, &bestsol); return EXIT_SUCCESS; }