/*****************************************************************/
/* evalb [-p param_file] [-dDh] [-c n] [-e n] gold-file test-file */
/*                                                               */
/* Evaluate bracketing in test-file against gold-file.           */
/* Return recall, precision, tagging accuracy.                   */
/*                                                               */
/* <option>                                                      */
/*   -p param_file   parameter file                              */
/*   -d              debug mode                                  */
/*   -D              debug mode with more detail (DEBUG=2)       */
/*   -c n            cut-off length for statistics               */
/*   -e n            number of errors to kill (default=10)       */
/*   -h              help                                        */
/*                                                               */
/* Satoshi Sekine (NYU)                                          */
/* Mike Collins (UPenn)                                          */
/*                                                               */
/* October.1997                                                  */
/*                                                               */
/* Please refer to the README for update information             */
/*****************************************************************/
/* Internal Data format -------------------------------------------*/
/*                                                                 */
/* (S (NP (NNX this)) (VP (VBX is) (NP (DT a) (NNX pen))) (SYM .)) */
/*                                                                 */
/* wn=5                                                            */
/*                  word   label                                   */
/*   terminal[0] =  this   NNX                                     */
/*   terminal[1] =  is     VBX                                     */
/*   terminal[2] =  a      DT                                      */
/*   terminal[3] =  pen    NNX                                     */
/*   terminal[4] =  .      SYM                                     */
/*                                                                 */
/* bn=4                                                            */
/*                  start  end  label   (end is exclusive)         */
/*   bracket[0] =   0      5    S                                  */
/*   bracket[1] =   0      1    NP                                 */
/*   bracket[2] =   1      4    VP                                 */
/*   bracket[3] =   2      4    NP                                 */
/*                                                                 */
/*                   matched bracketing                            */
/*   Recall    = ---------------------------                       */
/*                 # of bracket in ref-data                        */
/*                                                                 */
/*                   matched bracketing                            */
/*   Precision = ---------------------------                       */
/*                 # of bracket in test-data                       */
/*                                                                 */
/*-----------------------------------------------------------------*/
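/******************/
/* constant macro */
/******************/
/* NOTE(review): no macro definitions are present in this section of the  */
/* file as seen here, yet the code below references MAX_WORD_LEN,         */
/* MAX_LABEL_LEN, MAX_WORD_IN_SENT, MAX_BRACKET_IN_SENT, MAX_QUOTE_TERM,  */
/* MAX_DELETE_LABEL, MAX_EQ_LABEL, MAX_EQ_WORD, MAX_LINE_LEN,             */
/* DEFAULT_MAX_ERROR, DEFAULT_CUT_LEN, ARG_CHECK and STRNCMP.             */
/* Confirm that their definitions (and the #include lines) exist.         */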
/*************/ | |
/* structure */ | |
/*************/ | |
typedef struct ss_terminal { | |
char word[MAX_WORD_LEN]; | |
char label[MAX_LABEL_LEN]; | |
int result; /* 0:unmatch, 1:match, 9:undef */ | |
} s_terminal; | |
typedef struct ss_term_ind { | |
s_terminal term; | |
int index; | |
int bracket; | |
int endslen; | |
int ends[MAX_BRACKET_IN_SENT]; | |
} s_term_ind; | |
typedef struct ss_bracket { | |
int start; | |
int end; | |
unsigned int buf_start; | |
unsigned int buf_end; | |
char label[MAX_LABEL_LEN]; | |
int result; /* 0: unmatch, 1:match, 5:delete 9:undef */ | |
} s_bracket; | |
/* A pair of strings (labels or words) that are to be treated as equivalent. */
typedef struct ss_equiv {
    char *s1;   /* first member of the pair */
    char *s2;   /* second member of the pair */
} s_equiv;
/****************************/ | |
/* global variables */ | |
/* gold-data: suffix = 1 */ | |
/* test-data: suffix = 2 */ | |
/****************************/ | |
/*---------------*/ | |
/* Sentence data */ | |
/*---------------*/ | |
int wn1, wn2; /* number of words in sentence */ | |
int r_wn1; /* number of words in sentence */ | |
/* which only ignores labels in */ | |
/* DELETE_LABEL_FOR_LENGTH */ | |
s_terminal terminal1[MAX_WORD_IN_SENT]; /* terminal information */ | |
s_terminal terminal2[MAX_WORD_IN_SENT]; | |
s_term_ind quotterm1[MAX_QUOTE_TERM]; /* special terminals ("'","POS") */ | |
s_term_ind quotterm2[MAX_QUOTE_TERM]; | |
int bn1, bn2; /* number of brackets */ | |
int r_bn1, r_bn2; /* number of brackets */ | |
/* after deletion */ | |
s_bracket bracket1[MAX_BRACKET_IN_SENT]; /* bracket information */ | |
s_bracket bracket2[MAX_BRACKET_IN_SENT]; | |
/*------------*/ | |
/* Total data */ | |
/*------------*/ | |
int TOTAL_bn1, TOTAL_bn2, TOTAL_match; /* total number of brackets */ | |
int TOTAL_sent; /* No. of sentence */ | |
int TOTAL_error_sent; /* No. of error sentence */ | |
int TOTAL_skip_sent; /* No. of skip sentence */ | |
int TOTAL_comp_sent; /* No. of complete match sent */ | |
int TOTAL_word; /* total number of word */ | |
int TOTAL_crossing; /* total crossing */ | |
int TOTAL_no_crossing; /* no crossing sentence */ | |
int TOTAL_2L_crossing; /* 2 or less crossing sentence */ | |
int TOTAL_correct_tag; /* total correct tagging */ | |
int TOT_cut_len = DEFAULT_CUT_LEN; /* Cut-off length in statistics */ | |
/* data for sentences with len <= CUT_LEN */ | |
/* Historically it was 40. */ | |
int TOT40_bn1, TOT40_bn2, TOT40_match; /* total number of brackets */ | |
int TOT40_sent; /* No. of sentence */ | |
int TOT40_error_sent; /* No. of error sentence */ | |
int TOT40_skip_sent; /* No. of skip sentence */ | |
int TOT40_comp_sent; /* No. of complete match sent */ | |
int TOT40_word; /* total number of word */ | |
int TOT40_crossing; /* total crossing */ | |
int TOT40_no_crossing; /* no crossing sentence */ | |
int TOT40_2L_crossing; /* 2 or less crossing sentence */ | |
int TOT40_correct_tag; /* total correct tagging */ | |
/*------------*/ | |
/* miscallous */ | |
/*------------*/ | |
int Line; /* line number */ | |
int Error_count = 0; /* Error count */ | |
int Status; /* Result status for each sent */ | |
/* 0: OK, 1: skip, 2: error */ | |
/*-------------------*/ | |
/* stack manuplation */ | |
/*-------------------*/ | |
int stack_top; | |
int stack[MAX_BRACKET_IN_SENT]; | |
/************************************************************/ | |
/* User parameters which can be specified in parameter file */ | |
/************************************************************/ | |
/*------------------------------------------*/ | |
/* Debug mode */ | |
/* print out data for individual sentence */ | |
/*------------------------------------------*/ | |
int DEBUG=0; | |
/*------------------------------------------*/ | |
/* MAX error */ | |
/* Number of error to stop the process. */ | |
/* This is useful if there could be */ | |
/* tokanization error. */ | |
/* The process will stop when this number*/ | |
/* of errors are accumulated. */ | |
/*------------------------------------------*/ | |
int Max_error = DEFAULT_MAX_ERROR; | |
/*------------------------------------------*/ | |
/* Cut-off length for statistics */ | |
/* int TOT_cut_len = DEFAULT_CUT_LEN; */ | |
/* (Defined above) */ | |
/*------------------------------------------*/ | |
/*------------------------------------------*/ | |
/* unlabeled or labeled bracketing */ | |
/* 0: unlabeled bracketing */ | |
/* 1: labeled bracketing */ | |
/*------------------------------------------*/ | |
int F_label = 1; | |
/*------------------------------------------*/ | |
/* Delete labels */ | |
/* list of labels to be ignored. */ | |
/* If it is a pre-terminal label, delete */ | |
/* the word along with the brackets. */ | |
/* If it is a non-terminal label, just */ | |
/* delete the brackets (don't delete */ | |
/* childrens). */ | |
/*------------------------------------------*/ | |
char *Delete_label[MAX_DELETE_LABEL]; | |
int Delete_label_n = 0; | |
/*------------------------------------------*/ | |
/* Delete labels for length calculation */ | |
/* list of labels to be ignored for */ | |
/* length calculation purpose */ | |
/*------------------------------------------*/ | |
char *Delete_label_for_length[MAX_DELETE_LABEL]; | |
int Delete_label_for_length_n = 0; | |
/*------------------------------------------*/ | |
/* Labels to be considered for misquote */ | |
/* (could be possesive or quote) */ | |
/*------------------------------------------*/ | |
char *Quote_term[MAX_QUOTE_TERM]; | |
int Quote_term_n = 0; | |
/*------------------------------------------*/ | |
/* Equivalent labels, words */ | |
/* the pairs are considered equivalent */ | |
/* This is non-directional. */ | |
/*------------------------------------------*/ | |
s_equiv EQ_label[MAX_EQ_LABEL]; | |
int EQ_label_n = 0; | |
s_equiv EQ_word[MAX_EQ_WORD]; | |
int EQ_word_n = 0; | |
/************************/
/* Function return-type */
/************************/
/* Old-style declarations: they fix the return type only, so argument */
/* types are not checked at the call sites.                           */
int main();
void init_global();
void print_head();
void init();
void read_parameter_file();
void set_param();
int narg();
int read_line();
void pushb();
int popb();
int stackempty();
void calc_result(unsigned char *buf1,unsigned char *buf);
void fix_quote();
void reinsert_term();
void massage_data();
void modify_label();
void individual_result();
void print_total();
void dsp_info();
int is_terminator();
int is_deletelabel();
int is_deletelabel_for_length();
int is_quote_term();
int word_comp();
int label_comp();
void Error();
void Fatal();
void Usage();

/* ### provided by std headers
int fprintf();
int printf();
int atoi();
int fclose();
int sscanf();
*/
/***********/ | |
/* program */ | |
/***********/ | |
int | |
main(argc,argv) | |
int argc; | |
char *argv[]; | |
{ | |
char *filename1, *filename2; | |
FILE *fd1, *fd2; | |
unsigned char buff[5000]; | |
unsigned char buff1[5000]; | |
filename1=NULL; | |
filename2=NULL; | |
for(argc--,argv++;argc>0;argc--,argv++){ | |
if(**argv == '-'){ | |
while(*++(*argv)){ | |
switch(**argv){ | |
case 'h': /* help */ | |
Usage(); | |
exit(1); | |
case 'd': /* debug mode */ | |
DEBUG = 1; | |
goto nextarg; | |
case 'D': /* debug mode */ | |
DEBUG = 2; | |
goto nextarg; | |
case 'c': /* cut-off length */ | |
ARG_CHECK("cut-off length for statistices"); | |
TOT_cut_len = atoi(*argv); | |
goto nextarg; | |
case 'e': /* max error */ | |
ARG_CHECK("number of error to kill"); | |
Max_error = atoi(*argv); | |
goto nextarg; | |
case 'p': /* parameter file */ | |
ARG_CHECK("parameter file"); | |
read_parameter_file(*argv); | |
goto nextarg; | |
default: | |
Usage(); | |
exit(0); | |
} | |
} | |
} else { | |
if(filename1==NULL){ | |
filename1 = *argv; | |
}else if(filename2==NULL){ | |
filename2 = *argv; | |
} | |
} | |
nextarg: continue; | |
} | |
init_global(); | |
if((fd1 = fopen(filename1,"r"))==NULL){ | |
Fatal("Can't open gold file (%s)\n",filename1); | |
} | |
if((fd2 = fopen(filename2,"r"))==NULL){ | |
Fatal("Can't open test file (%s)\n",filename2); | |
} | |
print_head(); | |
for(Line=1;fgets(buff,5000,fd1)!=NULL;Line++){ | |
init(); | |
/* READ 1 */ | |
r_wn1 = read_line(buff,terminal1,quotterm1,&wn1,bracket1,&bn1); | |
strcpy(buff1,buff); | |
/* READ 2 */ | |
if(fgets(buff,5000,fd2)==NULL){ | |
Error("Number of lines unmatch (too many lines in gold file)\n"); | |
break; | |
} | |
read_line(buff,terminal2,quotterm2,&wn2,bracket2,&bn2); | |
/* Calculate result and print it */ | |
calc_result(buff1,buff); | |
if(DEBUG>=1){ | |
dsp_info(); | |
} | |
} | |
if(fgets(buff,5000,fd2)!=NULL){ | |
Error("Number of lines unmatch (too many lines in test file)\n"); | |
} | |
print_total(); | |
return (0); | |
} | |
/*-----------------------------*/ | |
/* initialize global variables */ | |
/*-----------------------------*/ | |
void | |
init_global() | |
{ | |
TOTAL_bn1 = TOTAL_bn2 = TOTAL_match = 0; | |
TOTAL_sent = TOTAL_error_sent = TOTAL_skip_sent = TOTAL_comp_sent = 0; | |
TOTAL_word = TOTAL_correct_tag = 0; | |
TOTAL_crossing = 0; | |
TOTAL_no_crossing = TOTAL_2L_crossing = 0; | |
TOT40_bn1 = TOT40_bn2 = TOT40_match = 0; | |
TOT40_sent = TOT40_error_sent = TOT40_skip_sent = TOT40_comp_sent = 0; | |
TOT40_word = TOT40_correct_tag = 0; | |
TOT40_crossing = 0; | |
TOT40_no_crossing = TOT40_2L_crossing = 0; | |
} | |
/*------------------*/
/* print head title */
/*------------------*/
/* Write the column headings of the per-sentence table to stdout. */
void
print_head()
{
    printf(" Sent. Matched Bracket Cross Correct Tag\n");
    printf(" ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy\n");
    printf("============================================================================\n");
}
/*-----------------------------------------------*/ | |
/* initialization at each individual computation */ | |
/*-----------------------------------------------*/ | |
void | |
init() | |
{ | |
int i; | |
wn1 = 0; | |
wn2 = 0; | |
bn1 = 0; | |
bn2 = 0; | |
r_bn1 = 0; | |
r_bn2 = 0; | |
for(i=0;i<MAX_WORD_IN_SENT;i++){ | |
terminal1[i].word[0] = '\0'; | |
terminal1[i].label[0] = '\0'; | |
terminal1[i].result = 9; | |
terminal2[i].word[0] = '\0'; | |
terminal2[i].label[0] = '\0'; | |
terminal2[i].result = 9; | |
} | |
for(i=0;i<MAX_QUOTE_TERM;i++){ | |
quotterm1[i].term.word[0] = '\0'; | |
quotterm1[i].term.label[0] = '\0'; | |
quotterm1[i].term.result = 9; | |
quotterm1[i].index = -1; | |
quotterm1[i].bracket = -1; | |
quotterm2[i].term.word[0] = '\0'; | |
quotterm2[i].term.label[0] = '\0'; | |
quotterm2[i].term.result = 9; | |
quotterm2[i].index = -1; | |
quotterm2[i].bracket = -1; | |
} | |
for(i=0;i<MAX_BRACKET_IN_SENT;i++){ | |
bracket1[i].start = -1; | |
bracket1[i].end = -1; | |
bracket1[i].label[0] = '\0'; | |
bracket1[i].result = 9; | |
bracket2[i].start = -1; | |
bracket2[i].end = -1; | |
bracket2[i].label[0] = '\0'; | |
bracket2[i].result = 9; | |
} | |
Status = 0; | |
} | |
/*----------------*/ | |
/* parameter file */ | |
/*----------------*/ | |
void | |
read_parameter_file(filename) | |
char *filename; | |
{ | |
char buff[MAX_LINE_LEN]; | |
FILE *fd; | |
int line; | |
int i; | |
if((fd=fopen(filename,"r"))==NULL){ | |
Fatal("Can't open parameter file (%s)\n",filename); | |
} | |
for(line=1;fgets(buff,MAX_LINE_LEN,fd)!=NULL;line++){ | |
/* clean up the tail and find unvalid line */ | |
/*-----------------------------------------*/ | |
for(i=strlen(buff)-1;i>0 && (isspace(buff[i]) || buff[i]=='\n');i--){ | |
buff[i]='\0'; | |
} | |
if(buff[0]=='#' || /* comment-line */ | |
strlen(buff)<3){ /* too short, just ignore */ | |
continue; | |
} | |
/* place the parameter and value */ | |
/*-------------------------------*/ | |
for(i=0;!isspace(buff[i]);i++); | |
for(;isspace(buff[i]) && buff[i]!='\0';i++); | |
if(buff[i]=='\0'){ | |
fprintf(stderr,"Empty value in parameter file (%d)\n",line); | |
} | |
/* set parameter and value */ | |
/*-------------------------*/ | |
set_param(buff,buff+i); | |
} | |
fclose(fd); | |
} | |
void | |
set_param(param,value) | |
char *param, *value; | |
{ | |
char l1[MAX_LABEL_LEN], l2[MAX_LABEL_LEN]; | |
if(STRNCMP("DEBUG")){ | |
DEBUG = atoi(value); | |
}else if(STRNCMP("MAX_ERROR")){ | |
Max_error = atoi(value); | |
}else if(STRNCMP("CUTOFF_LEN")){ | |
TOT_cut_len = atoi(value); | |
}else if(STRNCMP("LABELED")){ | |
F_label = atoi(value); | |
}else if(STRNCMP("DELETE_LABEL")){ | |
Delete_label[Delete_label_n] = (char *)malloc(strlen(value)+1); | |
strcpy(Delete_label[Delete_label_n],value); | |
Delete_label_n++; | |
}else if(STRNCMP("DELETE_LABEL_FOR_LENGTH")){ | |
Delete_label_for_length[Delete_label_for_length_n] = (char *)malloc(strlen(value)+1); | |
strcpy(Delete_label_for_length[Delete_label_for_length_n],value); | |
Delete_label_for_length_n++; | |
}else if(STRNCMP("QUOTE_LABEL")){ | |
Quote_term[Quote_term_n] = (char *)malloc(strlen(value)+1); | |
strcpy(Quote_term[Quote_term_n],value); | |
Quote_term_n++; | |
}else if(STRNCMP("EQ_LABEL")){ | |
if(narg(value)!=2){ | |
fprintf(stderr,"EQ_LABEL requires two values\n"); | |
return; | |
} | |
sscanf(value,"%s %s",l1,l2); | |
EQ_label[EQ_label_n].s1 = (char *)malloc(strlen(l1)+1); | |
strcpy(EQ_label[EQ_label_n].s1,l1); | |
EQ_label[EQ_label_n].s2 = (char *)malloc(strlen(l2)+1); | |
strcpy(EQ_label[EQ_label_n].s2,l2); | |
EQ_label_n++; | |
}else if(STRNCMP("EQ_WORD")){ | |
if(narg(value)!=2){ | |
fprintf(stderr,"EQ_WORD requires two values\n"); | |
return; | |
} | |
sscanf(value,"%s %s",l1,l2); | |
EQ_word[EQ_word_n].s1 = (char *)malloc(strlen(l1)+1); | |
strcpy(EQ_word[EQ_word_n].s1,l1); | |
EQ_word[EQ_word_n].s2 = (char *)malloc(strlen(l2)+1); | |
strcpy(EQ_word[EQ_word_n].s2,l2); | |
EQ_word_n++; | |
}else{ | |
fprintf(stderr,"Unknown keyword (%s) in parameter file\n",param); | |
} | |
} | |
/*
 * Count the whitespace separated tokens in s.
 * Returns 0 for an empty or all-blank string.
 */
int
narg(s)
char *s;
{
    int n = 0;

    while(*s!='\0'){
        /* skip the blanks in front of the next token */
        while(isspace((unsigned char)*s)){
            s++;
        }
        if(*s=='\0'){
            break;
        }
        n++;
        /* skip the token itself */
        while(*s!='\0' && !isspace((unsigned char)*s)){
            s++;
        }
    }
    return(n);
}
/*-----------------------------*/ | |
/* Read line and gather data. */ | |
/* Return langth of sentence. */ | |
/*-----------------------------*/ | |
int | |
read_line(buff, terminal, quotterm, wn, bracket, bn) | |
char *buff; | |
s_terminal terminal[]; | |
s_term_ind quotterm[]; | |
int *wn; | |
s_bracket bracket[]; | |
int *bn; | |
{ | |
char *p, *q, label[MAX_LABEL_LEN], word[MAX_WORD_LEN]; | |
int qt; /* quote term counter */ | |
int wid, bid; /* word ID, bracket ID */ | |
int n; /* temporary remembering the position */ | |
int b; /* temporary remembering bid */ | |
int i; | |
int len; /* length of the sentence */ | |
len = 0; | |
stack_top=0; | |
for(p=buff,qt=0,wid=0,bid=0;*p!='\0';){ | |
if(isspace(*p)){ | |
p++; | |
continue; | |
/* open bracket */ | |
/*--------------*/ | |
}else if(*p=='('){ | |
n=wid; | |
for(p++,i=0;!is_terminator(*p);p++,i++){ | |
label[i]=*p; | |
} | |
label[i]='\0'; | |
/* Find terminals */ | |
q = p; | |
if(isspace(*q)){ | |
for(q++;isspace(*q);q++); | |
for(i=0;!is_terminator(*q);q++,i++){ | |
word[i]=*q; | |
} | |
word[i]='\0'; | |
/* compute length */ | |
if(*q==')' && !is_deletelabel_for_length(label)==1){ | |
len++; | |
} | |
if (DEBUG>1) | |
printf("label=%s, word=%s, wid=%d\n",label,word,wid); | |
/* quote terminal */ | |
if(*q==')' && is_quote_term(label,word)==1){ | |
strcpy(quotterm[qt].term.word,word); | |
strcpy(quotterm[qt].term.label,label); | |
quotterm[qt].index = wid; | |
quotterm[qt].bracket = bid; | |
quotterm[qt].endslen = stack_top; | |
//quotterm[qt].ends = (int*)malloc(stack_top*sizeof(int)); | |
memcpy(quotterm[qt].ends,stack,stack_top*sizeof(int)); | |
qt++; | |
} | |
/* delete terminal */ | |
if(*q==')' && is_deletelabel(label)==1){ | |
p = q+1; | |
continue; | |
/* valid terminal */ | |
}else if(*q==')'){ | |
strcpy(terminal[wid].word,word); | |
strcpy(terminal[wid].label,label); | |
wid++; | |
p = q+1; | |
continue; | |
/* error */ | |
}else if(*q!='('){ | |
Error("More than two elements in a bracket\n"); | |
} | |
} | |
/* otherwise non-terminal label */ | |
bracket[bid].start = wid; | |
bracket[bid].buf_start = p-buff; | |
strcpy(bracket[bid].label,label); | |
pushb(bid); | |
bid++; | |
/* close bracket */ | |
/*---------------*/ | |
}else if(*p==')'){ | |
b = popb(); | |
bracket[b].end = wid; | |
bracket[b].buf_end = p-buff; | |
p++; | |
/* error */ | |
/*-------*/ | |
}else{ | |
Error("Reading sentence\n"); | |
} | |
} | |
if(!stackempty()){ | |
Error("Bracketing is unbalanced (too many open bracket)\n"); | |
} | |
*wn = wid; | |
*bn = bid; | |
return(len); | |
} | |
/*----------------------*/ | |
/* stack operation */ | |
/* for bracketing pairs */ | |
/*----------------------*/ | |
void | |
pushb(item) | |
int item; | |
{ | |
stack[stack_top++]=item; | |
} | |
int | |
popb() | |
{ | |
int item; | |
item = stack[stack_top-1]; | |
if(stack_top-- < 0){ | |
Error("Bracketing unbalance (too many close bracket)\n"); | |
} | |
return(item); | |
} | |
int | |
stackempty() | |
{ | |
if(stack_top==0){ | |
return(1); | |
}else{ | |
return(0); | |
} | |
} | |
/*------------------*/ | |
/* calculate result */ | |
/*------------------*/ | |
void | |
calc_result(unsigned char *buf1,unsigned char *buf) | |
{ | |
int i, j, l; | |
int match, crossing, correct_tag; | |
int last_i = -1; | |
char my_buf[1000]; | |
int match_found = 0; | |
char match_j[200]; | |
for (j = 0; j < bn2; ++j) { | |
match_j[j] = 0; | |
} | |
/* ML */ | |
if (DEBUG>1) | |
printf("\n"); | |
/* Find skip and error */ | |
/*---------------------*/ | |
if(wn2==0){ | |
Status = 2; | |
individual_result(0,0,0,0,0,0); | |
return; | |
} | |
if(wn1 != wn2){ | |
//if (DEBUG>1) | |
//Error("Length unmatch (%d|%d)\n",wn1,wn2); | |
fix_quote(); | |
if(wn1 != wn2){ | |
Error("Length unmatch (%d|%d)\n",wn1,wn2); | |
individual_result(0,0,0,0,0,0); | |
return; | |
} | |
} | |
for(i=0;i<wn1;i++){ | |
if(word_comp(terminal1[i].word,terminal2[i].word)==0){ | |
Error("Words unmatch (%s|%s)\n",terminal1[i].word, | |
terminal2[i].word); | |
individual_result(0,0,0,0,0,0); | |
return; | |
} | |
} | |
/* massage the data */ | |
/*------------------*/ | |
massage_data(); | |
/* matching brackets */ | |
/*-------------------*/ | |
match = 0; | |
for(i=0;i<bn1;i++){ | |
for(j=0;j<bn2;j++){ | |
if (DEBUG>1) | |
printf("1.res=%d, 2.res=%d, 1.start=%d, 2.start=%d, 1.end=%d, 2.end=%d\n",bracket1[i].result,bracket2[j].result,bracket1[i].start,bracket2[j].start,bracket1[i].end,bracket2[j].end); | |
// does bracket match? | |
if(bracket1[i].result != 5 && | |
bracket2[j].result == 0 && | |
bracket1[i].start == bracket2[j].start && bracket1[i].end == bracket2[j].end) { | |
// (1) do we not care about the label or (2) does the label match? | |
if (F_label==0 || label_comp(bracket1[i].label,bracket2[j].label)==1) { | |
bracket1[i].result = bracket2[j].result = 1; | |
match++; | |
match_found = 1; | |
break; | |
} else { | |
if (DEBUG>1) { | |
printf(" LABEL[%d-%d]: ",bracket1[i].start,bracket1[i].end-1); | |
l = bracket1[i].buf_end-bracket1[i].buf_start; | |
strncpy(my_buf,buf1+bracket1[i].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
} | |
match_found = 1; | |
match_j[j] = 1; | |
} | |
} | |
} | |
if (!match_found && bracket1[i].result != 5 && DEBUG>1) { | |
/* ### ML 09/28/03: gold bracket with no corresponding test bracket */ | |
printf(" BRACKET[%d-%d]: ",bracket1[i].start,bracket1[i].end-1); | |
l = bracket1[i].buf_end-bracket1[i].buf_start; | |
strncpy(my_buf,buf1+bracket1[i].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
} | |
match_found = 0; | |
} | |
for(j=0;j<bn2;j++){ | |
if (bracket2[j].result==0 && !match_j[j] && DEBUG>1) { | |
/* test bracket with no corresponding gold bracket */ | |
printf(" EXTRA[%d-%d]: ",bracket2[j].start,bracket2[j].end-1); | |
l = bracket2[j].buf_end-bracket2[j].buf_start; | |
strncpy(my_buf,buf+bracket2[j].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
} | |
} | |
/* crossing */ | |
/*----------*/ | |
crossing = 0; | |
/* crossing is counted based on the brackets */ | |
/* in test rather than gold file (by Mike) */ | |
for(j=0;j<bn2;j++){ | |
for(i=0;i<bn1;i++){ | |
if(bracket1[i].result != 5 && | |
bracket2[j].result != 5 && | |
((bracket1[i].start < bracket2[j].start && | |
bracket1[i].end > bracket2[j].start && | |
bracket1[i].end < bracket2[j].end) || | |
(bracket1[i].start > bracket2[j].start && | |
bracket1[i].start < bracket2[j].end && | |
bracket1[i].end > bracket2[j].end))){ | |
/* ### ML 09/01/03: get details on cross-brackettings */ | |
if (i != last_i) { | |
if (DEBUG>1) { | |
printf(" CROSSING[%d-%d]: ",bracket1[i].start,bracket1[i].end-1); | |
l = bracket1[i].buf_end-bracket1[i].buf_start; | |
strncpy(my_buf,buf1+bracket1[i].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
/* ML | |
printf("\n CROSSING at bracket %d:\n",i-1); | |
printf(" GOLD (tokens %d-%d): ",bracket1[i].start,bracket1[i].end-1); | |
l = bracket1[i].buf_end-bracket1[i].buf_start; | |
strncpy(my_buf,buf1+bracket1[i].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
*/ | |
} | |
last_i = i; | |
} | |
/* ML | |
printf(" TEST (tokens %d-%d): ",bracket2[j].start,bracket2[j].end-1); | |
l = bracket2[j].buf_end-bracket2[j].buf_start; | |
strncpy(my_buf,buf+bracket2[j].buf_start,l); | |
my_buf[l] = '\0'; | |
printf("%s\n",my_buf); | |
*/ | |
crossing++; | |
break; | |
} | |
} | |
} | |
/* Tagging accuracy */ | |
/*------------------*/ | |
correct_tag=0; | |
for(i=0;i<wn1;i++){ | |
if(label_comp(terminal1[i].label,terminal2[i].label)==1){ | |
terminal1[i].result = terminal2[i].result = 1; | |
correct_tag++; | |
} else { | |
terminal1[i].result = terminal2[i].result = 0; | |
} | |
} | |
individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag); | |
} | |
void | |
fix_quote() | |
{ | |
int i,j,k; | |
if (DEBUG>1) { | |
for(i=0;i<MAX_QUOTE_TERM;i++){ | |
if (quotterm1[i].index!=-1) | |
printf("%d: %s - %s\n",quotterm1[i].index, | |
quotterm1[i].term.label, | |
quotterm1[i].term.word); | |
if (quotterm2[i].index!=-1) | |
printf("%d: %s - %s\n",quotterm2[i].index, | |
quotterm2[i].term.label, | |
quotterm2[i].term.word); | |
} | |
} | |
for(i=0;i<MAX_QUOTE_TERM;i++) { | |
int ind = quotterm2[i].index; | |
if (ind!=-1) { | |
for(j=0;j<MAX_QUOTE_TERM;j++){ | |
if (quotterm1[j].index==ind && | |
strcmp(quotterm1[j].term.label, | |
quotterm2[i].term.label)!=0) { | |
if (is_deletelabel(quotterm1[j].term.label) && !is_deletelabel(quotterm2[i].term.label)) { | |
reinsert_term("term1[j],terminal1,bracket1,&wn1); | |
for (k=j;k<MAX_QUOTE_TERM;k++) | |
if (quotterm1[k].index!=-1) | |
quotterm1[k].index++; | |
} else if (is_deletelabel(quotterm2[i].term.label) && !is_deletelabel(quotterm1[j].term.label)) { | |
reinsert_term("term2[i],terminal2,bracket2,&wn2); | |
for (k=i;k<MAX_QUOTE_TERM;k++) | |
if (quotterm2[k].index!=-1) | |
quotterm2[k].index++; | |
} | |
} | |
} | |
} else break; | |
} | |
} | |
void | |
reinsert_term(quot,terminal,bracket,wn) | |
s_term_ind* quot; | |
s_terminal terminal[]; | |
s_bracket bracket[]; | |
int* wn; | |
{ | |
int ind = quot->index; | |
int bra = quot->bracket; | |
s_terminal* term = "->term; | |
int k; | |
memmove(&terminal[ind+1], | |
&terminal[ind], | |
sizeof(s_terminal)*(MAX_WORD_IN_SENT-ind-1)); | |
strcpy(terminal[ind].label,term->label); | |
strcpy(terminal[ind].word,term->word); | |
(*wn)++; | |
if (DEBUG>1) | |
printf("bra=%d, ind=%d\n",bra,ind); | |
for(k=0;k<MAX_BRACKET_IN_SENT;k++) { | |
if (bracket[k].start==-1) | |
break; | |
if (DEBUG>1) | |
printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end); | |
if (k>=bra) { | |
bracket[k].start++; | |
bracket[k].end++; | |
} | |
//if (bracket[k].start<=ind && bracket[k].end>=ind) | |
//bracket[k].end++; | |
} | |
if (DEBUG>1) | |
printf("endslen=%d\n",quot->endslen); | |
for(k=0;k<quot->endslen;k++) { | |
//printf("ends[%d]=%d",k,quot->ends[k]); | |
bracket[quot->ends[k]].end++; | |
} | |
//free(quot->ends); | |
} | |
/*
void
adjust_end(ind,bra)
int ind;
int bra;
{
    for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
        if (bracket[k].start==-1)
            break;
        printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
        if (k>=bra)
            bracket[k].end++;
    }
}
*/
void | |
massage_data() | |
{ | |
int i, j; | |
/* for GOLD */ | |
/*----------*/ | |
for(i=0;i<bn1;i++){ | |
bracket1[i].result = 0; | |
/* Zero element */ | |
if(bracket1[i].start == bracket1[i].end){ | |
bracket1[i].result = 5; | |
continue; | |
} | |
/* Modify label */ | |
modify_label(bracket1[i].label); | |
/* Delete label */ | |
for(j=0;j<Delete_label_n;j++){ | |
if(label_comp(bracket1[i].label,Delete_label[j])==1){ | |
bracket1[i].result = 5; | |
} | |
} | |
} | |
/* for TEST */ | |
/*----------*/ | |
for(i=0;i<bn2;i++){ | |
bracket2[i].result = 0; | |
/* Zero element */ | |
if(bracket2[i].start == bracket2[i].end){ | |
bracket2[i].result = 5; | |
continue; | |
} | |
/* Modify label */ | |
modify_label(bracket2[i].label); | |
/* Delete label */ | |
for(j=0;j<Delete_label_n;j++){ | |
if(label_comp(bracket2[i].label,Delete_label[j])==1){ | |
bracket2[i].result = 5; | |
} | |
} | |
} | |
/* count up real number of brackets (exclude deleted ones) */ | |
/*---------------------------------------------------------*/ | |
r_bn1 = r_bn2 = 0; | |
for(i=0;i<bn1;i++){ | |
if(bracket1[i].result != 5){ | |
r_bn1++; | |
} | |
} | |
for(i=0;i<bn2;i++){ | |
if(bracket2[i].result != 5){ | |
r_bn2++; | |
} | |
} | |
} | |
/*------------------------*/
/* trim the tail of label */
/*------------------------*/
/*
 * Cut label, in place, at its first '-' or '='.
 * A label without either character is left unchanged; a label that
 * starts with one of them becomes empty.
 */
void
modify_label(label)
char *label;
{
    /* strcspn gives the length of the part before the first '-' or '=' */
    label[strcspn(label,"-=")] = '\0';
}
/*-----------------------------------------------*/ | |
/* add individual statistics to TOTAL statictics */ | |
/*-----------------------------------------------*/ | |
void | |
individual_result(wn1,bn1,bn2,match,crossing,correct_tag) | |
int wn1,bn1,bn2,match,crossing,correct_tag; | |
{ | |
/* Statistics for ALL */ | |
/*--------------------*/ | |
TOTAL_sent++; | |
if(Status==1){ | |
TOTAL_error_sent++; | |
}else if(Status==2){ | |
TOTAL_skip_sent++; | |
}else{ | |
TOTAL_bn1 += bn1; | |
TOTAL_bn2 += bn2; | |
TOTAL_match += match; | |
if(bn1==bn2 && bn2==match){ | |
TOTAL_comp_sent++; | |
} | |
TOTAL_word += wn1; | |
TOTAL_crossing += crossing; | |
if(crossing==0){ | |
TOTAL_no_crossing++; | |
} | |
if(crossing <= 2){ | |
TOTAL_2L_crossing++; | |
} | |
TOTAL_correct_tag += correct_tag; | |
} | |
/* Statistics for sent length <= TOT_cut_len */ | |
/*-------------------------------------------*/ | |
if(r_wn1<=TOT_cut_len){ | |
TOT40_sent++; | |
if(Status==1){ | |
TOT40_error_sent++; | |
}else if(Status==2){ | |
TOT40_skip_sent++; | |
}else{ | |
TOT40_bn1 += bn1; | |
TOT40_bn2 += bn2; | |
TOT40_match += match; | |
if(bn1==bn2 && bn2==match){ | |
TOT40_comp_sent++; | |
} | |
TOT40_word += wn1; | |
TOT40_crossing += crossing; | |
if(crossing==0){ | |
TOT40_no_crossing++; | |
} | |
if(crossing <= 2){ | |
TOT40_2L_crossing++; | |
} | |
TOT40_correct_tag += correct_tag; | |
} | |
} | |
/* Print individual result */ | |
/*-------------------------*/ | |
printf("%4d %3d %d ",Line,r_wn1,Status); | |
printf("%6.2f %6.2f %3d %3d %3d %3d", | |
(r_bn1==0?0.0:100.0*match/r_bn1), | |
(r_bn2==0?0.0:100.0*match/r_bn2), | |
match, r_bn1, r_bn2, crossing); | |
printf(" %4d %4d %6.2f\n",wn1,correct_tag, | |
(wn1==0?0.0:100.0*correct_tag/wn1)); | |
} | |
/*------------------------*/ | |
/* print total statistics */ | |
/*------------------------*/ | |
void | |
print_total() | |
{ | |
int sentn; | |
double r,p,f; | |
printf("============================================================================\n"); | |
if(TOTAL_bn1>0 && TOTAL_bn2>0){ | |
printf(" %6.2f %6.2f %6d %5d %5d %5d", | |
(TOTAL_bn1>0?100.0*TOTAL_match/TOTAL_bn1:0.0), | |
(TOTAL_bn2>0?100.0*TOTAL_match/TOTAL_bn2:0.0), | |
TOTAL_match, | |
TOTAL_bn1, | |
TOTAL_bn2, | |
TOTAL_crossing); | |
} | |
printf(" %5d %5d %6.2f", | |
TOTAL_word, | |
TOTAL_correct_tag, | |
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0)); | |
printf("\n"); | |
printf("=== Summary ===\n"); | |
sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent; | |
printf("\n-- All --\n"); | |
printf("Number of sentence = %6d\n",TOTAL_sent); | |
printf("Number of Error sentence = %6d\n",TOTAL_error_sent); | |
printf("Number of Skip sentence = %6d\n",TOTAL_skip_sent); | |
printf("Number of Valid sentence = %6d\n",sentn); | |
r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0; | |
printf("Bracketing Recall = %6.2f\n",r); | |
p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0; | |
printf("Bracketing Precision = %6.2f\n",p); | |
f = 2*p*r/(p+r); | |
printf("Bracketing FMeasure = %6.2f\n",f); | |
printf("Complete match = %6.2f\n", | |
(sentn>0?100.0*TOTAL_comp_sent/sentn:0.0)); | |
printf("Average crossing = %6.2f\n", | |
(sentn>0?1.0*TOTAL_crossing/sentn:0.0)); | |
printf("No crossing = %6.2f\n", | |
(sentn>0?100.0*TOTAL_no_crossing/sentn:0.0)); | |
printf("2 or less crossing = %6.2f\n", | |
(sentn>0?100.0*TOTAL_2L_crossing/sentn:0.0)); | |
printf("Tagging accuracy = %6.2f\n", | |
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0)); | |
sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent; | |
printf("\n-- len<=%d --\n",TOT_cut_len); | |
printf("Number of sentence = %6d\n",TOT40_sent); | |
printf("Number of Error sentence = %6d\n",TOT40_error_sent); | |
printf("Number of Skip sentence = %6d\n",TOT40_skip_sent); | |
printf("Number of Valid sentence = %6d\n",sentn); | |
r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0; | |
printf("Bracketing Recall = %6.2f\n",r); | |
p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0; | |
printf("Bracketing Precision = %6.2f\n",p); | |
f = 2*p*r/(p+r); | |
printf("Bracketing FMeasure = %6.2f\n",f); | |
printf("Complete match = %6.2f\n", | |
(sentn>0?100.0*TOT40_comp_sent/sentn:0.0)); | |
printf("Average crossing = %6.2f\n", | |
(sentn>0?1.0*TOT40_crossing/sentn:0.0)); | |
printf("No crossing = %6.2f\n", | |
(sentn>0?100.0*TOT40_no_crossing/sentn:0.0)); | |
printf("2 or less crossing = %6.2f\n", | |
(sentn>0?100.0*TOT40_2L_crossing/sentn:0.0)); | |
printf("Tagging accuracy = %6.2f\n", | |
(TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0)); | |
} | |
/*--------------------------------*/ | |
/* display individual information */ | |
/*--------------------------------*/ | |
void | |
dsp_info() | |
{ | |
int i, n; | |
printf("-<1>---(wn1=%3d, bn1=%3d)- ",wn1,bn1); | |
printf("-<2>---(wn2=%3d, bn2=%3d)-\n",wn2,bn2); | |
n = (wn1>wn2?wn1:wn2); | |
for(i=0;i<n;i++){ | |
if(terminal1[i].word[0]!='\0'){ | |
printf("%3d : %d : %-6s %-16s ",i,terminal1[i].result, | |
terminal1[i].label,terminal1[i].word); | |
}else{ | |
printf(" "); | |
} | |
if(terminal2[i].word[0]!='\0'){ | |
printf("%3d : %d : %-6s %-16s\n",i,terminal2[i].result, | |
terminal2[i].label,terminal2[i].word); | |
}else{ | |
printf("\n"); | |
} | |
} | |
printf("\n"); | |
n = (bn1>bn2?bn1:bn2); | |
for(i=0;i<n;i++){ | |
if(bracket1[i].start != -1){ | |
printf("%3d : %d : %3d %3d %-6s ",i,bracket1[i].result, | |
bracket1[i].start,bracket1[i].end, | |
bracket1[i].label); | |
} else { | |
printf(" "); | |
} | |
if(bracket2[i].start != -1){ | |
printf("%3d : %d : %3d %3d %-6s\n",i,bracket2[i].result, | |
bracket2[i].start,bracket2[i].end, | |
bracket2[i].label); | |
} else { | |
printf("\n"); | |
} | |
} | |
printf("\n"); | |
printf("========\n"); | |
} | |
/*-----------------*/ | |
/* some predicates */ | |
/*-----------------*/ | |
/*
 * Return 1 when c ends a token, i.e. it is a whitespace character,
 * '(' or ')'; return 0 for every other character.
 */
int
is_terminator(c)
char c;
{
    /*
     * The <ctype.h> classifiers are only defined for EOF and values
     * representable as unsigned char.  A plain char can be negative
     * (bytes >= 0x80 where char is signed), so convert before calling
     * isspace().
     */
    if(c=='(' || c==')' || isspace((unsigned char)c)){
        return(1);
    }
    return(0);
}
int | |
is_deletelabel(s) | |
char *s; | |
{ | |
int i; | |
for(i=0;i<Delete_label_n;i++){ | |
if(strcmp(s,Delete_label[i])==0){ | |
return(1); | |
} | |
} | |
return(0); | |
} | |
int | |
is_deletelabel_for_length(s) | |
char *s; | |
{ | |
int i; | |
for(i=0;i<Delete_label_for_length_n;i++){ | |
if(strcmp(s,Delete_label_for_length[i])==0){ | |
return(1); | |
} | |
} | |
return(0); | |
} | |
int | |
is_quote_term(s,w) | |
char *s; | |
char *w; | |
{ | |
int i; | |
for(i=0;i<Quote_term_n;i++){ | |
if(strcmp(s,Quote_term[i])==0){ | |
if (strcmp(w,"'")==0 || strcmp(w,"\"")==0 || strcmp(w,"/")==0) | |
return(1); | |
} | |
} | |
return(0); | |
} | |
/*---------------*/ | |
/* compare words */ | |
/*---------------*/ | |
int | |
word_comp(s1,s2) | |
char *s1,*s2; | |
{ | |
int i; | |
if(strcmp(s1,s2)==0){ | |
return(1); | |
} | |
for(i=0;i<EQ_word_n;i++){ | |
if((strcmp(s1,EQ_word[i].s1)==0 && | |
strcmp(s2,EQ_word[i].s2)==0) || | |
(strcmp(s1,EQ_word[i].s2)==0 && | |
strcmp(s2,EQ_word[i].s1)==0)){ | |
return(1); | |
} | |
} | |
return(0); | |
} | |
/*----------------*/ | |
/* compare labels */ | |
/*----------------*/ | |
int | |
label_comp(s1,s2) | |
char *s1,*s2; | |
{ | |
int i; | |
if(strcmp(s1,s2)==0){ | |
return(1); | |
} | |
for(i=0;i<EQ_label_n;i++){ | |
if((strcmp(s1,EQ_label[i].s1)==0 && | |
strcmp(s2,EQ_label[i].s2)==0) || | |
(strcmp(s1,EQ_label[i].s2)==0 && | |
strcmp(s2,EQ_label[i].s1)==0)){ | |
return(1); | |
} | |
} | |
return(0); | |
} | |
/*--------*/ | |
/* errors */ | |
/*--------*/ | |
void | |
Error(s,arg1,arg2,arg3) | |
char *s, *arg1, *arg2, *arg3; | |
{ | |
Status = 1; | |
fprintf(stderr,"%d : ",Line); | |
fprintf(stderr,s,arg1,arg2,arg3); | |
if(Error_count++>Max_error){ | |
exit(1); | |
} | |
} | |
/*---------------------*/ | |
/* fatal error to exit */ | |
/*---------------------*/ | |
/*
 * Report an unrecoverable error: write the message built from the
 * printf-style format s and up to three arguments to stderr, then
 * terminate the process with exit status 1.  Never returns.
 */
void
Fatal(s,arg1,arg2,arg3)
char *s, *arg1, *arg2, *arg3;
{
    FILE *out = stderr;

    fprintf(out,s,arg1,arg2,arg3);
    exit(1);
}
/*-------*/ | |
/* Usage */ | |
/*-------*/ | |
/*
 * Write the command-line help text to stderr, one line at a time.
 */
void
Usage()
{
    /* none of these lines contains a '%', so they are written verbatim */
    static const char *const help_text[] = {
        " evalb [-dDh][-c n][-e n][-p param_file] gold-file test-file \n",
        " \n",
        " Evaluate bracketing in test-file against gold-file. \n",
        " Return recall, precision, F-Measure, tag accuracy. \n",
        " \n",
        " <option> \n",
        " -d debug mode \n",
        " -D debug mode plus bracketing info \n",
        " -c n cut-off length forstatistics (def.=40)\n",
        " -e n number of error to kill (default=10) \n",
        " -p param_file parameter file \n",
        " -h help \n"
    };
    int count = (int)(sizeof help_text / sizeof help_text[0]);
    int k;

    for(k=0;k<count;k++){
        fputs(help_text[k],stderr);
    }
}