diff -Nurd -x'*~' uni2ascii-4.18.orig/ascii2uni.c uni2ascii-4.18/ascii2uni.c --- uni2ascii-4.18.orig/ascii2uni.c 2011-05-14 22:15:20.000000000 -0400 +++ uni2ascii-4.18/ascii2uni.c 2015-02-08 16:27:40.000000000 -0500 @@ -208,6 +208,7 @@ char aHfmt [8+2+1]; char aDfmt [8+2+1]; char cbuf[5]; + char fmt_itoa[12]; FILE *infp; UTF32 num; @@ -555,45 +556,64 @@ } else if (FType == CHENT) { if (AllHTMLP){ + NConsumed = -1; if(sscanf(iptr,aHfmt,&num,&NConsumed) > 0) { - if(*(iptr+NConsumed-1) != ';') { + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') { MicrosoftStyle++; + if (NConsumed == -1) { + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%lx", num) > sizeof(fmt_itoa)-1) { + fprintf(stderr, "UTF32 codepoint overflowed static buffer\n"); + exit(BADRECORD); + } + NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" */; + } fprintf(stderr, _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu lacks the requisite final semicolon.\n"), ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo); if(StrictP) {putchar(*iptr++); continue;} - else {putu8(num);iptr+=NConsumed;} + else {putu8(num);iptr+=NConsumed-1;} } else {putu8(num);iptr+=NConsumed;} TokenNumber++; continue; } + NConsumed = -1; if(sscanf(iptr,aDfmt,&num,&NConsumed) > 0) { - if(*(iptr+NConsumed-1) != ';') { + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') { MicrosoftStyle++; + if (NConsumed == -1) { + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%lu", num) > sizeof(fmt_itoa)-1) { + fprintf(stderr, "UTF32 codepoint overflowed static buffer\n"); + exit(BADRECORD); + } + NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */; + } fprintf(stderr, _("The HTML/HDML entity %1$s at token %2$lu of line %3$lu lacks the requisite final semicolon.\n"), ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo); if (StrictP) {putchar(*iptr++); continue;} - else {putu8(num);iptr+=NConsumed;} + else {putu8(num);iptr+=NConsumed-1;} } else {putu8(num);iptr+=NConsumed;} TokenNumber++; continue; } } + NConsumed = -1; if(sscanf(iptr,afmt,&enam,&NConsumed) > 0) { + if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 /* ";" */; if( (num = LookupCodeForEntity(enam))) { if(*(iptr+NConsumed-1) != ';') { MicrosoftStyle++; fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu of line %3$lu lacks the requisite final semicolon.\n"),ExtractSubstring(tmpstr,iptr,iptr+NConsumed-3),TokenNumber,LineNo); if(StrictP) {putchar(*iptr++);continue;} - else {putu8(num);iptr+=NConsumed;} + else {putu8(num);iptr+=NConsumed-1;} } else {putu8(num);iptr+=NConsumed;} TokenNumber++; } else { + if(*(iptr+NConsumed-1) != ';') NConsumed--; fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity \"&%s;\" at line %lu\n", enam,LineNo); putu8(UNI_REPLACEMENT_CHAR); diff -Nurd -x'*~' uni2ascii-4.18.orig/enttbl.c uni2ascii-4.18/enttbl.c --- uni2ascii-4.18.orig/enttbl.c 2011-02-18 11:36:27.000000000 -0500 +++ uni2ascii-4.18/enttbl.c 2015-02-08 15:53:08.000000000 -0500 @@ -20,6 +20,7 @@ #include "config.h" #include +#include #include "unicode.h" struct ent { diff -Nurd -x'*~' uni2ascii-4.18.orig/uni2ascii.c uni2ascii-4.18/uni2ascii.c --- uni2ascii-4.18.orig/uni2ascii.c 2011-05-14 22:03:13.000000000 -0400 +++ uni2ascii-4.18/uni2ascii.c 2015-02-08 16:20:41.000000000 -0500 @@ -68,6 +68,8 @@ #include #endif +extern void putu8 (unsigned long); + void ShowVersion(FILE *fp) { @@ -2679,6 +2681,7 @@ * search may be advisable. For the time being we just do a linear search. */ +int SubstituteChar(UTF32 c) { int i; for(i = 0; i < SubCnt; i++) { @@ -2693,6 +2696,7 @@ return 0; } +void AddCustomSubstitution(char *str){ char *Left; char *Right;