/* This file is part of libmspack. * (C) 2003-2023 Stuart Caie. * * libmspack is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License (LGPL) version 2.1 * * For further details, see the file COPYING.LIB distributed with libmspack */ /* CHM decompression implementation */ #include #include /* prototypes */ static struct mschmd_header * chmd_open( struct mschm_decompressor *base, const char *filename); static struct mschmd_header * chmd_fast_open( struct mschm_decompressor *base, const char *filename); static struct mschmd_header *chmd_real_open( struct mschm_decompressor *base, const char *filename, int entire); static void chmd_close( struct mschm_decompressor *base, struct mschmd_header *chm); static int chmd_read_headers( struct mspack_system *sys, struct mspack_file *fh, struct mschmd_header *chm, int entire); static int chmd_fast_find( struct mschm_decompressor *base, struct mschmd_header *chm, const char *filename, struct mschmd_file *f_ptr, int f_size); static unsigned char *read_chunk( struct mschm_decompressor_p *self, struct mschmd_header *chm, struct mspack_file *fh, unsigned int chunk); static int search_chunk( struct mschmd_header *chm, const unsigned char *chunk, const char *filename, const unsigned char **result, const unsigned char **result_end); static inline int compare( const char *s1, const char *s2, int l1, int l2); static int chmd_extract( struct mschm_decompressor *base, struct mschmd_file *file, const char *filename); static int chmd_sys_write( struct mspack_file *file, void *buffer, int bytes); static int chmd_init_decomp( struct mschm_decompressor_p *self, struct mschmd_file *file); static int read_reset_table( struct mschm_decompressor_p *self, struct mschmd_sec_mscompressed *sec, unsigned int entry, off_t *length_ptr, off_t *offset_ptr); static int read_spaninfo( struct mschm_decompressor_p *self, struct mschmd_sec_mscompressed *sec, off_t *length_ptr); static int 
find_sys_file( struct mschm_decompressor_p *self, struct mschmd_sec_mscompressed *sec, struct mschmd_file **f_ptr, const char *name); static unsigned char *read_sys_file( struct mschm_decompressor_p *self, struct mschmd_file *file); static int chmd_error( struct mschm_decompressor *base); static int read_off64( off_t *var, unsigned char *mem, struct mspack_system *sys, struct mspack_file *fh); static off_t read_encint( const unsigned char **p, const unsigned char *end, int *err); /* filenames of the system files used for decompression. * Content and ControlData are essential. * ResetTable is preferred, but SpanInfo can be used if not available */ static const char *content_name = "::DataSpace/Storage/MSCompressed/Content"; static const char *control_name = "::DataSpace/Storage/MSCompressed/ControlData"; static const char *spaninfo_name = "::DataSpace/Storage/MSCompressed/SpanInfo"; static const char *rtable_name = "::DataSpace/Storage/MSCompressed/Transform/" "{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"; /*************************************** * MSPACK_CREATE_CHM_DECOMPRESSOR *************************************** * constructor */ struct mschm_decompressor * mspack_create_chm_decompressor(struct mspack_system *sys) { struct mschm_decompressor_p *self = NULL; if (!sys) sys = mspack_default_system; if (!mspack_valid_system(sys)) return NULL; if ((self = (struct mschm_decompressor_p *) sys->alloc(sys, sizeof(struct mschm_decompressor_p)))) { self->base.open = &chmd_open; self->base.close = &chmd_close; self->base.extract = &chmd_extract; self->base.last_error = &chmd_error; self->base.fast_open = &chmd_fast_open; self->base.fast_find = &chmd_fast_find; self->system = sys; self->error = MSPACK_ERR_OK; self->d = NULL; } return (struct mschm_decompressor *) self; } /*************************************** * MSPACK_DESTROY_CAB_DECOMPRESSOR *************************************** * destructor */ void mspack_destroy_chm_decompressor(struct 
mschm_decompressor *base) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; if (self) { struct mspack_system *sys = self->system; if (self->d) { if (self->d->infh) sys->close(self->d->infh); if (self->d->state) lzxd_free(self->d->state); sys->free(self->d); } sys->free(self); } } /*************************************** * CHMD_OPEN *************************************** * opens a file and tries to read it as a CHM file. * Calls chmd_real_open() with entire=1. */ static struct mschmd_header *chmd_open(struct mschm_decompressor *base, const char *filename) { return chmd_real_open(base, filename, 1); } /*************************************** * CHMD_FAST_OPEN *************************************** * opens a file and tries to read it as a CHM file, but does not read * the file headers. Calls chmd_real_open() with entire=0 */ static struct mschmd_header *chmd_fast_open(struct mschm_decompressor *base, const char *filename) { return chmd_real_open(base, filename, 0); } /*************************************** * CHMD_REAL_OPEN *************************************** * the real implementation of chmd_open() and chmd_fast_open(). It simply * passes the "entire" parameter to chmd_read_headers(), which will then * either read all headers, or a bare mininum. 
*/ static struct mschmd_header *chmd_real_open(struct mschm_decompressor *base, const char *filename, int entire) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; struct mschmd_header *chm = NULL; struct mspack_system *sys; struct mspack_file *fh; int error; if (!base) return NULL; sys = self->system; if ((fh = sys->open(sys, filename, MSPACK_SYS_OPEN_READ))) { if ((chm = (struct mschmd_header *) sys->alloc(sys, sizeof(struct mschmd_header)))) { chm->filename = filename; error = chmd_read_headers(sys, fh, chm, entire); if (error) { /* if the error is DATAFORMAT, and there are some results, return * partial results with a warning, rather than nothing */ if (error == MSPACK_ERR_DATAFORMAT && (chm->files || chm->sysfiles)) { sys->message(fh, "WARNING; contents are corrupt"); error = MSPACK_ERR_OK; } else { chmd_close(base, chm); chm = NULL; } } self->error = error; } else { self->error = MSPACK_ERR_NOMEMORY; } sys->close(fh); } else { self->error = MSPACK_ERR_OPEN; } return chm; } /*************************************** * CHMD_CLOSE *************************************** * frees all memory associated with a given mschmd_header */ static void chmd_close(struct mschm_decompressor *base, struct mschmd_header *chm) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; struct mschmd_file *fi, *nfi; struct mspack_system *sys; unsigned int i; if (!base) return; sys = self->system; self->error = MSPACK_ERR_OK; /* free files */ for (fi = chm->files; fi; fi = nfi) { nfi = fi->next; sys->free(fi); } for (fi = chm->sysfiles; fi; fi = nfi) { nfi = fi->next; sys->free(fi); } /* if this CHM was being decompressed, free decompression state */ if (self->d && (self->d->chm == chm)) { if (self->d->infh) sys->close(self->d->infh); if (self->d->state) lzxd_free(self->d->state); sys->free(self->d); self->d = NULL; } /* if this CHM had a chunk cache, free it and contents */ if (chm->chunk_cache) { for (i = 0; i < chm->num_chunks; i++) 
sys->free(chm->chunk_cache[i]); sys->free(chm->chunk_cache); } sys->free(chm); } /*************************************** * CHMD_READ_HEADERS *************************************** * reads the basic CHM file headers. If the "entire" parameter is * non-zero, all file entries will also be read. fills out a pre-existing * mschmd_header structure, allocates memory for files as necessary */ /* The GUIDs found in CHM headers */ static const unsigned char guids[32] = { /* {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} */ 0x10, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11, 0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC, /* {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} */ 0x11, 0xFD, 0x01, 0x7C, 0xAA, 0x7B, 0xD0, 0x11, 0x9E, 0x0C, 0x00, 0xA0, 0xC9, 0x22, 0xE6, 0xEC }; static int chmd_read_headers(struct mspack_system *sys, struct mspack_file *fh, struct mschmd_header *chm, int entire) { unsigned int errors, num_chunks; unsigned char buf[0x54], *chunk = NULL; const unsigned char *name, *p, *end; struct mschmd_file *fi, *link = NULL; off_t offset_hs0, filelen; int num_entries, err = 0; /* initialise pointers */ chm->files = NULL; chm->sysfiles = NULL; chm->chunk_cache = NULL; chm->sec0.base.chm = chm; chm->sec0.base.id = 0; chm->sec1.base.chm = chm; chm->sec1.base.id = 1; chm->sec1.content = NULL; chm->sec1.control = NULL; chm->sec1.spaninfo = NULL; chm->sec1.rtable = NULL; /* read the first header */ if (sys->read(fh, &buf[0], chmhead_SIZEOF) != chmhead_SIZEOF) { return MSPACK_ERR_READ; } /* check ITSF signature */ if (EndGetI32(&buf[chmhead_Signature]) != 0x46535449) { return MSPACK_ERR_SIGNATURE; } /* check both header GUIDs */ if (memcmp(&buf[chmhead_GUID1], &guids[0], 32L) != 0) { D(("incorrect GUIDs")) return MSPACK_ERR_SIGNATURE; } chm->version = EndGetI32(&buf[chmhead_Version]); chm->timestamp = EndGetM32(&buf[chmhead_Timestamp]); chm->language = EndGetI32(&buf[chmhead_LanguageID]); if (chm->version > 3) { sys->message(fh, "WARNING; CHM version > 3"); } /* read the header section 
table */ if (sys->read(fh, &buf[0], chmhst3_SIZEOF) != chmhst3_SIZEOF) { return MSPACK_ERR_READ; } /* chmhst3_OffsetCS0 does not exist in version 1 or 2 CHM files. * The offset will be corrected later, once HS1 is read. */ if (read_off64(&offset_hs0, &buf[chmhst_OffsetHS0], sys, fh) || read_off64(&chm->dir_offset, &buf[chmhst_OffsetHS1], sys, fh) || read_off64(&chm->sec0.offset, &buf[chmhst3_OffsetCS0], sys, fh)) { return MSPACK_ERR_DATAFORMAT; } /* seek to header section 0 */ if (sys->seek(fh, offset_hs0, MSPACK_SYS_SEEK_START)) { return MSPACK_ERR_SEEK; } /* read header section 0 */ if (sys->read(fh, &buf[0], chmhs0_SIZEOF) != chmhs0_SIZEOF) { return MSPACK_ERR_READ; } if (read_off64(&chm->length, &buf[chmhs0_FileLen], sys, fh)) { return MSPACK_ERR_DATAFORMAT; } /* compare declared CHM file size against actual size */ if (!mspack_sys_filelen(sys, fh, &filelen)) { if (chm->length > filelen) { sys->message(fh, "WARNING; file possibly truncated by %" LD " bytes", chm->length - filelen); } else if (chm->length < filelen) { sys->message(fh, "WARNING; possible %" LD " extra bytes at end of file", filelen - chm->length); } } /* seek to header section 1 */ if (sys->seek(fh, chm->dir_offset, MSPACK_SYS_SEEK_START)) { return MSPACK_ERR_SEEK; } /* read header section 1 */ if (sys->read(fh, &buf[0], chmhs1_SIZEOF) != chmhs1_SIZEOF) { return MSPACK_ERR_READ; } chm->dir_offset = sys->tell(fh); chm->chunk_size = EndGetI32(&buf[chmhs1_ChunkSize]); chm->density = EndGetI32(&buf[chmhs1_Density]); chm->depth = EndGetI32(&buf[chmhs1_Depth]); chm->index_root = EndGetI32(&buf[chmhs1_IndexRoot]); chm->num_chunks = EndGetI32(&buf[chmhs1_NumChunks]); chm->first_pmgl = EndGetI32(&buf[chmhs1_FirstPMGL]); chm->last_pmgl = EndGetI32(&buf[chmhs1_LastPMGL]); if (chm->version < 3) { /* versions before 3 don't have chmhst3_OffsetCS0 */ chm->sec0.offset = chm->dir_offset + (chm->chunk_size * chm->num_chunks); } /* check if content offset or file size is wrong */ if (chm->sec0.offset > 
chm->length) { D(("content section begins after file has ended")) return MSPACK_ERR_DATAFORMAT; } /* ensure there are chunks and that chunk size is * large enough for signature and num_entries */ if (chm->chunk_size < (pmgl_Entries + 2)) { D(("chunk size not large enough")) return MSPACK_ERR_DATAFORMAT; } if (chm->num_chunks == 0) { D(("no chunks")) return MSPACK_ERR_DATAFORMAT; } /* The chunk_cache data structure is not great; large values for num_chunks * or num_chunks*chunk_size can exhaust all memory. Until a better chunk * cache is implemented, put arbitrary limits on num_chunks and chunk size. */ if (chm->num_chunks > 100000) { D(("more than 100,000 chunks")) return MSPACK_ERR_DATAFORMAT; } if (chm->chunk_size > 8192) { D(("chunk size over 8192 (get in touch if this is valid)")) return MSPACK_ERR_DATAFORMAT; } if ((off_t)chm->chunk_size * (off_t)chm->num_chunks > chm->length) { D(("chunks larger than entire file")) return MSPACK_ERR_DATAFORMAT; } /* common sense checks on header section 1 fields */ if (chm->chunk_size != 4096) { sys->message(fh, "WARNING; chunk size is not 4096"); } if (chm->first_pmgl != 0) { sys->message(fh, "WARNING; first PMGL chunk is not zero"); } if (chm->first_pmgl > chm->last_pmgl) { D(("first pmgl chunk is after last pmgl chunk")) return MSPACK_ERR_DATAFORMAT; } if (chm->index_root != 0xFFFFFFFF && chm->index_root >= chm->num_chunks) { D(("index_root outside valid range")) return MSPACK_ERR_DATAFORMAT; } /* if we are doing a quick read, stop here! 
*/ if (!entire) { return MSPACK_ERR_OK; } /* seek to the first PMGL chunk, and reduce the number of chunks to read */ if (chm->first_pmgl != 0) { off_t pmgl_offset = (off_t) chm->first_pmgl * (off_t) chm->chunk_size; if (sys->seek(fh, pmgl_offset, MSPACK_SYS_SEEK_CUR)) { return MSPACK_ERR_SEEK; } } num_chunks = chm->last_pmgl - chm->first_pmgl + 1; if (!(chunk = (unsigned char *) sys->alloc(sys, (size_t)chm->chunk_size))) { return MSPACK_ERR_NOMEMORY; } /* read and process all chunks from FirstPMGL to LastPMGL */ errors = 0; while (num_chunks--) { /* read next chunk */ if (sys->read(fh, chunk, (int)chm->chunk_size) != (int)chm->chunk_size) { sys->free(chunk); return MSPACK_ERR_READ; } /* process only directory (PMGL) chunks */ if (EndGetI32(&chunk[pmgl_Signature]) != 0x4C474D50) continue; if (EndGetI32(&chunk[pmgl_QuickRefSize]) < 2) { sys->message(fh, "WARNING; PMGL quickref area is too small"); } if (EndGetI32(&chunk[pmgl_QuickRefSize]) > (chm->chunk_size - pmgl_Entries)) { sys->message(fh, "WARNING; PMGL quickref area is too large"); } p = &chunk[pmgl_Entries]; end = &chunk[chm->chunk_size - 2]; num_entries = EndGetI16(end); while (num_entries--) { unsigned int name_len, section; off_t offset, length; name_len = read_encint(&p, end, &err); if (err || (name_len > (unsigned int) (end - p))) goto encint_err; name = p; p += name_len; section = read_encint(&p, end, &err); offset = read_encint(&p, end, &err); length = read_encint(&p, end, &err); if (err) goto encint_err; /* ignore blank or one-char (e.g. "/") filenames we'd return as blank */ if (name_len < 2 || !name[0] || !name[1]) continue; /* empty files and directory names are stored as a file entry at * offset 0 with length 0. 
We want to keep empty files, but not * directory names, which end with a "/" */ if ((offset == 0) && (length == 0)) { if ((name_len > 0) && (name[name_len-1] == '/')) continue; } if (section > 1) { sys->message(fh, "invalid section number '%u'.", section); continue; } if (!(fi = (struct mschmd_file *) sys->alloc(sys, sizeof(struct mschmd_file) + name_len + 1))) { sys->free(chunk); return MSPACK_ERR_NOMEMORY; } fi->next = NULL; fi->filename = (char *) &fi[1]; fi->section = ((section == 0) ? (struct mschmd_section *) (&chm->sec0) : (struct mschmd_section *) (&chm->sec1)); fi->offset = offset; fi->length = length; sys->copy((unsigned char *) name, fi->filename, (size_t) name_len); fi->filename[name_len] = '\0'; if (name[0] == ':' && name[1] == ':') { /* system file */ if (name_len == 40 && memcmp(name, content_name, 40) == 0) { chm->sec1.content = fi; } else if (name_len == 44 && memcmp(name, control_name, 44) == 0) { chm->sec1.control = fi; } else if (name_len == 41 && memcmp(name, spaninfo_name, 41) == 0) { chm->sec1.spaninfo = fi; } else if (name_len == 105 && memcmp(name, rtable_name, 105) == 0) { chm->sec1.rtable = fi; } fi->next = chm->sysfiles; chm->sysfiles = fi; } else { /* normal file */ if (link) link->next = fi; else chm->files = fi; link = fi; } } /* this is reached either when num_entries runs out, or if * an ENCINT is badly encoded */ encint_err: if (num_entries >= 0) { D(("bad encint before all entries could be read")) errors++; } } sys->free(chunk); return (errors > 0) ? MSPACK_ERR_DATAFORMAT : MSPACK_ERR_OK; } /*************************************** * CHMD_FAST_FIND *************************************** * uses PMGI index chunks and quickref data to quickly locate a file * directly from the on-disk index. 
* * TODO: protect against infinite loops in chunks (where pgml_NextChunk * or a PMGI index entry point to an already visited chunk) */ static int chmd_fast_find(struct mschm_decompressor *base, struct mschmd_header *chm, const char *filename, struct mschmd_file *f_ptr, int f_size) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; struct mspack_system *sys; struct mspack_file *fh; /* p and end are initialised to prevent MSVC warning about "potentially" * uninitialised usage. This is provably untrue, but MS won't fix: * https://developercommunity.visualstudio.com/content/problem/363489/c4701-false-positive-warning.html */ const unsigned char *chunk, *p = NULL, *end = NULL; int err = MSPACK_ERR_OK, result = -1; unsigned int n, sec; if (!self || !chm || !f_ptr || (f_size != sizeof(struct mschmd_file))) { return MSPACK_ERR_ARGS; } sys = self->system; /* clear the results structure */ memset(f_ptr, 0, f_size); if (!(fh = sys->open(sys, chm->filename, MSPACK_SYS_OPEN_READ))) { return MSPACK_ERR_OPEN; } /* go through PMGI chunk hierarchy to reach PMGL chunk */ if (chm->index_root < chm->num_chunks) { n = chm->index_root; for (;;) { if (!(chunk = read_chunk(self, chm, fh, n))) { sys->close(fh); return self->error; } /* search PMGI/PMGL chunk. exit early if no entry found */ if ((result = search_chunk(chm, chunk, filename, &p, &end)) <= 0) { break; } /* found result. loop around for next chunk if this is PMGI */ if (chunk[3] == 0x4C) break; n = read_encint(&p, end, &err); if (err) goto encint_err; } } else { /* PMGL chunks only, search from first_pmgl to last_pmgl */ for (n = chm->first_pmgl; n <= chm->last_pmgl; n = EndGetI32(&chunk[pmgl_NextChunk])) { if (!(chunk = read_chunk(self, chm, fh, n))) { err = self->error; break; } /* search PMGL chunk. 
exit if file found */ if ((result = search_chunk(chm, chunk, filename, &p, &end)) > 0) { break; } /* stop simple infinite loops: can't visit the same chunk twice */ if (n == EndGetI32(&chunk[pmgl_NextChunk])) { break; } } } /* if we found a file, read it */ if (result > 0) { sec = read_encint(&p, end, &err); f_ptr->section = (sec == 0) ? (struct mschmd_section *) &chm->sec0 : (struct mschmd_section *) &chm->sec1; f_ptr->offset = read_encint(&p, end, &err); f_ptr->length = read_encint(&p, end, &err); if (err) goto encint_err; } else if (result < 0) { err = MSPACK_ERR_DATAFORMAT; } sys->close(fh); return self->error = err; encint_err: D(("bad encint in PGMI/PGML chunk")) sys->close(fh); return self->error = MSPACK_ERR_DATAFORMAT; } /* reads the given chunk into memory, storing it in a chunk cache * so it doesn't need to be read from disk more than once */ static unsigned char *read_chunk(struct mschm_decompressor_p *self, struct mschmd_header *chm, struct mspack_file *fh, unsigned int chunk_num) { struct mspack_system *sys = self->system; unsigned char *buf; /* check arguments - most are already checked by chmd_fast_find */ if (chunk_num >= chm->num_chunks) return NULL; /* ensure chunk cache is available */ if (!chm->chunk_cache) { size_t size = sizeof(unsigned char *) * chm->num_chunks; if (!(chm->chunk_cache = (unsigned char **) sys->alloc(sys, size))) { self->error = MSPACK_ERR_NOMEMORY; return NULL; } memset(chm->chunk_cache, 0, size); } /* try to answer out of chunk cache */ if (chm->chunk_cache[chunk_num]) return chm->chunk_cache[chunk_num]; /* need to read chunk - allocate memory for it */ if (!(buf = (unsigned char *) sys->alloc(sys, chm->chunk_size))) { self->error = MSPACK_ERR_NOMEMORY; return NULL; } /* seek to block and read it */ if (sys->seek(fh, (off_t) (chm->dir_offset + (chunk_num * chm->chunk_size)), MSPACK_SYS_SEEK_START)) { self->error = MSPACK_ERR_SEEK; sys->free(buf); return NULL; } if (sys->read(fh, buf, (int)chm->chunk_size) != 
(int)chm->chunk_size) { self->error = MSPACK_ERR_READ; sys->free(buf); return NULL; } /* check the signature. Is is PMGL or PMGI? */ if (!((buf[0] == 0x50) && (buf[1] == 0x4D) && (buf[2] == 0x47) && ((buf[3] == 0x4C) || (buf[3] == 0x49)))) { self->error = MSPACK_ERR_SEEK; sys->free(buf); return NULL; } /* all OK. Store chunk in cache and return it */ return chm->chunk_cache[chunk_num] = buf; } /* searches a PMGI/PMGL chunk for a given filename entry. Returns -1 on * data format error, 0 if entry definitely not found, 1 if entry * found. In the latter case, *result and *result_end are set pointing * to that entry's data (either the "next chunk" ENCINT for a PMGI or * the section, offset and length ENCINTs for a PMGL). * * In the case of PMGL chunks, the entry has definitely been * found. In the case of PMGI chunks, the entry which points to the * chunk that may eventually contain that entry has been found. */ static int search_chunk(struct mschmd_header *chm, const unsigned char *chunk, const char *filename, const unsigned char **result, const unsigned char **result_end) { const unsigned char *start, *end, *p; unsigned int qr_size, num_entries, qr_entries, qr_density, name_len; unsigned int L, R, M, fname_len, entries_off, is_pmgl; int cmp, err = 0; fname_len = strlen(filename); /* PMGL chunk or PMGI chunk? 
(note: read_chunk() has already * checked the rest of the characters in the chunk signature) */ if (chunk[3] == 0x4C) { is_pmgl = 1; entries_off = pmgl_Entries; } else { is_pmgl = 0; entries_off = pmgi_Entries; } /* Step 1: binary search first filename of each QR entry * - target filename == entry * found file * - target filename < all entries * file not found * - target filename > all entries * proceed to step 2 using final entry * - target filename between two searched entries * proceed to step 2 */ qr_size = EndGetI32(&chunk[pmgl_QuickRefSize]); start = &chunk[chm->chunk_size - 2]; end = &chunk[chm->chunk_size - qr_size]; num_entries = EndGetI16(start); qr_density = 1 + (1 << chm->density); qr_entries = (num_entries + qr_density-1) / qr_density; if (num_entries == 0) { D(("chunk has no entries")) return -1; } if (qr_size > chm->chunk_size) { D(("quickref size > chunk size")) return -1; } *result_end = end; if (((int)qr_entries * 2) > (start - end)) { D(("WARNING; more quickrefs than quickref space")) qr_entries = 0; /* but we can live with it */ } if (qr_entries > 0) { L = 0; R = qr_entries - 1; do { /* pick new midpoint */ M = (L + R) >> 1; /* compare filename with entry QR points to */ p = &chunk[entries_off + (M ? EndGetI16(start - (M << 1)) : 0)]; name_len = read_encint(&p, end, &err); if (err || (name_len > (unsigned int) (end - p))) goto encint_err; cmp = compare(filename, (char *)p, fname_len, name_len); if (cmp == 0) break; else if (cmp < 0) { if (M) R = M - 1; else return 0; } else if (cmp > 0) L = M + 1; } while (L <= R); M = (L + R) >> 1; if (cmp == 0) { /* exact match! */ p += name_len; *result = p; return 1; } /* otherwise, read the group of entries for QR entry M */ p = &chunk[entries_off + (M ? EndGetI16(start - (M << 1)) : 0)]; num_entries -= (M * qr_density); if (num_entries > qr_density) num_entries = qr_density; } else { p = &chunk[entries_off]; } /* Step 2: linear search through the set of entries reached in step 1. 
* - filename == any entry * found entry * - filename < all entries (PMGI) or any entry (PMGL) * entry not found, stop now * - filename > all entries * entry not found (PMGL) / maybe found (PMGI) * - */ *result = NULL; while (num_entries-- > 0) { name_len = read_encint(&p, end, &err); if (err || (name_len > (unsigned int) (end - p))) goto encint_err; cmp = compare(filename, (char *)p, fname_len, name_len); p += name_len; if (cmp == 0) { /* entry found */ *result = p; return 1; } if (cmp < 0) { /* entry not found (PMGL) / maybe found (PMGI) */ break; } /* read and ignore the rest of this entry */ if (is_pmgl) { while (p < end && (*p++ & 0x80)); /* skip section ENCINT */ while (p < end && (*p++ & 0x80)); /* skip offset ENCINT */ while (p < end && (*p++ & 0x80)); /* skip length ENCINT */ } else { *result = p; /* store potential final result */ while (p < end && (*p++ & 0x80)); /* skip chunk number ENCINT */ } } /* PMGL? not found. PMGI? maybe found */ return (is_pmgl) ? 0 : (*result ? 1 : 0); encint_err: D(("bad encint while searching")) return -1; } #if HAVE_TOWLOWER # include # define TOLOWER(x) towlower(x) #else # include # define TOLOWER(x) tolower(x) #endif /* decodes a UTF-8 character from s[] into c. Will not read past e. * doesn't test that extension bytes are %10xxxxxx. * allows some overlong encodings. */ #define GET_UTF8_CHAR(s, e, c) do { \ unsigned char x = *s++; \ if (x < 0x80) c = x; \ else if (x >= 0xC2 && x < 0xE0 && s < e) { \ c = (x & 0x1F) << 6 | (*s++ & 0x3F); \ } \ else if (x >= 0xE0 && x < 0xF0 && s+1 < e) { \ c = (x & 0x0F) << 12 | (s[0] & 0x3F) << 6 | (s[1] & 0x3F); \ s += 2; \ } \ else if (x >= 0xF0 && x <= 0xF5 && s+2 < e) { \ c = (x & 0x07) << 18 | (s[0] & 0x3F) << 12 | \ (s[1] & 0x3F) << 6 | (s[2] & 0x3F); \ if (c > 0x10FFFF) c = 0xFFFD; \ s += 3; \ } \ else c = 0xFFFD; \ } while (0) /* case-insensitively compares two UTF8 encoded strings. 
String length for * both strings must be provided, null bytes are not terminators */ static inline int compare(const char *s1, const char *s2, int l1, int l2) { register const unsigned char *p1 = (const unsigned char *) s1; register const unsigned char *p2 = (const unsigned char *) s2; register const unsigned char *e1 = p1 + l1, *e2 = p2 + l2; int c1, c2; while (p1 < e1 && p2 < e2) { GET_UTF8_CHAR(p1, e1, c1); GET_UTF8_CHAR(p2, e2, c2); if (c1 == c2) continue; c1 = TOLOWER(c1); c2 = TOLOWER(c2); if (c1 != c2) return c1 - c2; } return l1 - l2; } /*************************************** * CHMD_EXTRACT *************************************** * extracts a file from a CHM helpfile */ static int chmd_extract(struct mschm_decompressor *base, struct mschmd_file *file, const char *filename) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; struct mspack_system *sys; struct mschmd_header *chm; struct mspack_file *fh; off_t bytes; if (!self) return MSPACK_ERR_ARGS; if (!file || !file->section) return self->error = MSPACK_ERR_ARGS; sys = self->system; chm = file->section->chm; /* create decompression state if it doesn't exist */ if (!self->d) { self->d = (struct mschmd_decompress_state *) sys->alloc(sys, sizeof(struct mschmd_decompress_state)); if (!self->d) return self->error = MSPACK_ERR_NOMEMORY; self->d->chm = chm; self->d->offset = 0; self->d->state = NULL; self->d->sys = *sys; self->d->sys.write = &chmd_sys_write; self->d->infh = NULL; self->d->outfh = NULL; } /* open input chm file if not open, or the open one is a different chm */ if (!self->d->infh || (self->d->chm != chm)) { if (self->d->infh) sys->close(self->d->infh); if (self->d->state) lzxd_free(self->d->state); self->d->chm = chm; self->d->offset = 0; self->d->state = NULL; self->d->infh = sys->open(sys, chm->filename, MSPACK_SYS_OPEN_READ); if (!self->d->infh) return self->error = MSPACK_ERR_OPEN; } /* open file for output */ if (!(fh = sys->open(sys, filename, MSPACK_SYS_OPEN_WRITE))) 
{ return self->error = MSPACK_ERR_OPEN; } /* if file is empty, simply creating it is enough */ if (!file->length) { sys->close(fh); return self->error = MSPACK_ERR_OK; } self->error = MSPACK_ERR_OK; switch (file->section->id) { case 0: /* Uncompressed section file */ /* simple seek + copy */ if (sys->seek(self->d->infh, chm->sec0.offset + file->offset, MSPACK_SYS_SEEK_START)) { self->error = MSPACK_ERR_SEEK; } else { unsigned char buf[512]; off_t length = file->length; off_t maxlen = chm->length - sys->tell(self->d->infh); if (length > maxlen) { sys->message(fh, "WARNING; file is %" LD " bytes longer than CHM file", length - maxlen); } while (length > 0) { int run = sizeof(buf); if ((off_t)run > length) run = (int)length; if (sys->read(self->d->infh, &buf[0], run) != run) { self->error = MSPACK_ERR_READ; break; } if (sys->write(fh, &buf[0], run) != run) { self->error = MSPACK_ERR_WRITE; break; } length -= run; } } break; case 1: /* MSCompressed section file */ /* (re)initialise compression state if not yet initialised, * or we have advanced too far and have to backtrack */ if (!self->d->state || (file->offset < self->d->offset)) { if (self->d->state) { lzxd_free(self->d->state); self->d->state = NULL; } if (chmd_init_decomp(self, file)) break; } /* check file offset is not impossible */ if (file->offset > self->d->length) { self->error = MSPACK_ERR_DECRUNCH; break; } /* seek to input data */ if (sys->seek(self->d->infh, self->d->inoffset, MSPACK_SYS_SEEK_START)) { self->error = MSPACK_ERR_SEEK; break; } /* get to correct offset. 
*/ self->d->outfh = NULL; if ((bytes = file->offset - self->d->offset)) { self->error = lzxd_decompress(self->d->state, bytes); } /* if getting to the correct offset was error free, unpack file */ if (!self->error) { off_t length = file->length; off_t maxlen = self->d->length - file->offset; if (length > maxlen) { sys->message(fh, "WARNING; file is %" LD " bytes longer than " "compressed section", length - maxlen); length = maxlen + 1; /* should decompress but still error out */ } self->d->outfh = fh; self->error = lzxd_decompress(self->d->state, length); } /* save offset in input source stream, in case there is a section 0 * file between now and the next section 1 file extracted */ self->d->inoffset = sys->tell(self->d->infh); /* if an LZX error occured, the LZX decompressor is now useless */ if (self->error) { if (self->d->state) lzxd_free(self->d->state); self->d->state = NULL; } break; } sys->close(fh); return self->error; } /*************************************** * CHMD_SYS_WRITE *************************************** * chmd_sys_write is the internal writer function which the decompressor * uses. If either writes data to disk (self->d->outfh) with the real * sys->write() function, or does nothing with the data when * self->d->outfh == NULL. advances self->d->offset. */ static int chmd_sys_write(struct mspack_file *file, void *buffer, int bytes) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) file; self->d->offset += bytes; if (self->d->outfh) { return self->system->write(self->d->outfh, buffer, bytes); } return bytes; } /*************************************** * CHMD_INIT_DECOMP *************************************** * Initialises the LZX decompressor to decompress the compressed stream, * from the nearest reset offset and length that is needed for the given * file. 
*/
/* Parameters:
 *   self - the decompressor; self->d must already exist with its input
 *          file handle open, as both are used here
 *   file - a file in the MSCompressed section; file->section is treated
 *          as a struct mschmd_sec_mscompressed
 * On success, sets self->d->inoffset, ->offset, ->length and ->state and
 * returns MSPACK_ERR_OK. On failure returns a non-zero error code, which
 * is also stored in self->error.
 */
static int chmd_init_decomp(struct mschm_decompressor_p *self,
                            struct mschmd_file *file)
{
    int window_size, window_bits, reset_interval, entry, err;
    int have_entry = 0;
    struct mspack_system *sys = self->system;
    struct mschmd_sec_mscompressed *sec;
    unsigned char *data;
    off_t length, offset, frame;

    sec = (struct mschmd_sec_mscompressed *) file->section;

    /* ensure we have a mscompressed content section */
    err = find_sys_file(self, sec, &sec->content, content_name);
    if (err) return self->error = err;

    /* ensure we have a ControlData file */
    err = find_sys_file(self, sec, &sec->control, control_name);
    if (err) return self->error = err;

    /* read ControlData */
    if (sec->control->length != lzxcd_SIZEOF) {
        D(("ControlData file is wrong size"))
        return self->error = MSPACK_ERR_DATAFORMAT;
    }
    if (!(data = read_sys_file(self, sec->control))) {
        D(("can't read mscompressed control data file"))
        return self->error;
    }

    /* check LZXC signature */
    if (EndGetI32(&data[lzxcd_Signature]) != 0x43585A4C) {
        sys->free(data);
        return self->error = MSPACK_ERR_SIGNATURE;
    }

    /* read reset_interval and window_size and validate version number.
     * Version 1 stores them directly; version 2 stores them as a number
     * of LZX frames */
    switch (EndGetI32(&data[lzxcd_Version])) {
    case 1:
        reset_interval = EndGetI32(&data[lzxcd_ResetInterval]);
        window_size    = EndGetI32(&data[lzxcd_WindowSize]);
        break;
    case 2:
        reset_interval = EndGetI32(&data[lzxcd_ResetInterval]) * LZX_FRAME_SIZE;
        window_size    = EndGetI32(&data[lzxcd_WindowSize])    * LZX_FRAME_SIZE;
        break;
    default:
        D(("bad controldata version"))
        sys->free(data);
        return self->error = MSPACK_ERR_DATAFORMAT;
    }

    /* free ControlData */
    sys->free(data);

    /* find window_bits from window_size */
    switch (window_size) {
    case 0x008000: window_bits = 15; break;
    case 0x010000: window_bits = 16; break;
    case 0x020000: window_bits = 17; break;
    case 0x040000: window_bits = 18; break;
    case 0x080000: window_bits = 19; break;
    case 0x100000: window_bits = 20; break;
    case 0x200000: window_bits = 21; break;
    default:
        D(("bad controldata window size"))
        return self->error = MSPACK_ERR_DATAFORMAT;
    }

    /* validate reset_interval. It is held in a signed int, so a huge value
     * in the file shows up as negative; that would make every calculation
     * below meaningless, so reject it along with zero */
    if (reset_interval <= 0 || reset_interval % LZX_FRAME_SIZE) {
        D(("bad controldata reset interval"))
        return self->error = MSPACK_ERR_DATAFORMAT;
    }

    /* which reset table entry would we like? First the reset interval
     * multiple, then converted from reset interval multiple (usually 64k)
     * to 32k frames. This is computed in off_t because file->offset comes
     * from the file and may be far larger than an int can hold */
    frame = (file->offset / reset_interval) * (reset_interval / LZX_FRAME_SIZE);

    /* read the reset table entry, unless the frame number is out of any
     * range a reset table could describe */
    entry = 0;
    if (frame >= 0 && frame <= 0x7FFFFFFF) {
        entry = (int) frame;
        have_entry = read_reset_table(self, sec, (unsigned int) entry,
                                      &length, &offset);
    }

    if (have_entry) {
        /* the uncompressed length given in the reset table is dishonest.
         * the uncompressed data is always padded out from the given
         * uncompressed length up to the next reset interval.
         * NOTE(review): the mask only rounds to a reset_interval multiple
         * when reset_interval is a power of two - confirm that other
         * intervals never occur */
        length += reset_interval - 1;
        length &= -reset_interval;
    }
    else {
        /* if we can't read the reset table entry, just start from
         * the beginning. Use spaninfo to get the uncompressed length */
        entry = 0;
        offset = 0;
        err = read_spaninfo(self, sec, &length);
        if (err) return self->error = err;
    }

    /* get offset of compressed data stream:
     * = offset of uncompressed section from start of file
     * + offset of compressed stream from start of uncompressed section
     * + offset of chosen reset interval from start of compressed stream */
    self->d->inoffset = file->section->chm->sec0.offset + sec->content->offset + offset;

    /* set start offset and overall remaining stream length. The product
     * is widened first: as an int it overflows once the chosen frame lies
     * 2GiB or more into the stream */
    self->d->offset = (off_t) entry * LZX_FRAME_SIZE;
    self->d->length = length;
    length -= self->d->offset;

    /* initialise LZX stream. The decompressor object itself is passed as
     * the "output file", which chmd_sys_write() casts back */
    self->d->state = lzxd_init(&self->d->sys, self->d->infh,
                               (struct mspack_file *) self, window_bits,
                               reset_interval / LZX_FRAME_SIZE,
                               4096, length, 0);
    if (!self->d->state) return self->error = MSPACK_ERR_NOMEMORY;

    /* every failure above returned early. A failed reset table read that
     * we recovered from via SpanInfo may have left an error code behind
     * in self->error; it must not be reported now that setup succeeded */
    return self->error = MSPACK_ERR_OK;
}

/***************************************
 * READ_RESET_TABLE
 ***************************************
 * Reads one entry out of the reset table. Also reads the uncompressed
 * data length. Writes these to offset_ptr and length_ptr respectively.
 * Returns non-zero for success, zero for failure.
*/
/* self       - decompressor; self->system frees the table buffer and
 *              self->d->infh is handed to read_off64()
 * sec        - MSCompressed section; sec->rtable caches the ResetTable entry
 * entry      - index (in LZX frames) of the wanted reset table entry
 * length_ptr - receives the uncompressed stream length from the table header
 * offset_ptr - receives the compressed offset stored in the chosen entry
 *
 * Note the inverted convention compared with most helpers in this file:
 * 1 means success, 0 means "reset table unusable". *length_ptr may already
 * have been written when 0 is returned.
 */
static int read_reset_table(struct mschm_decompressor_p *self,
                            struct mschmd_sec_mscompressed *sec,
                            unsigned int entry,
                            off_t *length_ptr, off_t *offset_ptr)
{
  struct mspack_system *sys = self->system;
  unsigned char *data;
  unsigned int pos, entrysize, tableoffset, numentries, tablelen;

  /* do we have a ResetTable file? */
  int err = find_sys_file(self, sec, &sec->rtable, rtable_name);
  if (err) return 0;

  /* read ResetTable file */
  if (sec->rtable->length < lzxrt_headerSIZEOF) {
    D(("ResetTable file is too short"))
    return 0;
  }
  if (sec->rtable->length > 1000000) { /* arbitrary upper limit */
    D(("ResetTable >1MB (%"LD"), report if genuine", sec->rtable->length))
    return 0;
  }
  if (!(data = read_sys_file(self, sec->rtable))) {
    D(("can't read reset table"))
    return 0;
  }

  /* check sanity of reset table */
  if (EndGetI32(&data[lzxrt_FrameLen]) != LZX_FRAME_SIZE) {
    D(("bad reset table frame length"))
    sys->free(data);
    return 0;
  }

  /* get the uncompressed length of the LZX stream */
  if (read_off64(length_ptr, &data[lzxrt_UncompLen], sys, self->d->infh)) {
    sys->free(data);
    return 0;
  }

  entrysize   = EndGetI32(&data[lzxrt_EntrySize]);
  tableoffset = EndGetI32(&data[lzxrt_TableOffset]);
  numentries  = EndGetI32(&data[lzxrt_NumEntries]);
  /* the length checks above limit this to at most 1000000 */
  tablelen    = (unsigned int) sec->rtable->length;

  if (entrysize != 4 && entrysize != 8) {
    /* validate the entry size before using it in any arithmetic */
    D(("reset table entry size neither 4 nor 8"))
    err = 1;
  }
  else if (entry >= numentries ||
           tablelen < entrysize ||
           tableoffset > tablelen - entrysize ||
           entry > (tablelen - entrysize - tableoffset) / entrysize)
  {
    /* ensure reset table entry for this offset exists. The test is written
     * with subtraction and division so that no term can wrap around;
     * computing "tableoffset + entry * entrysize" first could wrap in
     * 32 bits and slip past a simple "pos <= length - entrysize" check */
    D(("bad reset interval"))
    err = 1;
  }
  else {
    /* cannot wrap: bounded by tablelen - entrysize (see check above) */
    pos = tableoffset + entry * entrysize;
    if (entrysize == 4) {
      *offset_ptr = EndGetI32(&data[pos]);
      err = 0;
    }
    else {
      err = read_off64(offset_ptr, &data[pos], sys, self->d->infh);
    }
  }

  /* free the reset table */
  sys->free(data);

  /* return success */
  return (err == 0);
}

/***************************************
 * READ_SPANINFO
 ***************************************
 * Reads the uncompressed data length from the spaninfo file.
 * Returns zero for success or a non-zero error code for failure.
*/
/* self       - decompressor (provides the system interface and input handle)
 * sec        - MSCompressed section; sec->spaninfo caches the SpanInfo entry
 * length_ptr - receives the uncompressed length; always written (0 first)
 *              once the SpanInfo file has been found and size-checked
 *
 * Any problem other than a failed read is reported as
 * MSPACK_ERR_DATAFORMAT; a failed read returns whatever read_sys_file()
 * left in self->error. This function does not assign self->error itself.
 */
static int read_spaninfo(struct mschm_decompressor_p *self,
                         struct mschmd_sec_mscompressed *sec,
                         off_t *length_ptr)
{
  struct mspack_system *system = self->system;
  unsigned char *raw;
  int failed;

  /* locate the SpanInfo system file; "not found" is a format error here */
  if (find_sys_file(self, sec, &sec->spaninfo, spaninfo_name) != 0) {
    return MSPACK_ERR_DATAFORMAT;
  }

  /* SpanInfo holds exactly one 64-bit value */
  if (sec->spaninfo->length != 8) {
    D(("SpanInfo file is wrong size"))
    return MSPACK_ERR_DATAFORMAT;
  }

  /* give the output a defined value up front: gcc -Wuninitialized cannot
   * see that read_sys_file() always sets self->error when it returns NULL,
   * and would otherwise warn about the caller's variable */
  *length_ptr = 0;

  /* pull the SpanInfo file into memory */
  raw = read_sys_file(self, sec->spaninfo);
  if (raw == NULL) {
    D(("can't read SpanInfo file"))
    return self->error;
  }

  /* decode the uncompressed length of the LZX stream, then drop the buffer */
  failed = read_off64(length_ptr, raw, system, self->d->infh);
  system->free(raw);
  if (failed) {
    return MSPACK_ERR_DATAFORMAT;
  }

  /* only a strictly positive length is acceptable */
  if (*length_ptr > 0) {
    return MSPACK_ERR_OK;
  }
  D(("output length is invalid"))
  return MSPACK_ERR_DATAFORMAT;
}

/***************************************
 * FIND_SYS_FILE
 ***************************************
 * Uses chmd_fast_find to locate a system file, and fills out that system
 * file's entry and links it into the list of system files. Returns zero
 * for success, non-zero for both failure and the file not existing.
*/
/* self  - decompressor, passed on to chmd_fast_find() and used for alloc()
 * sec   - MSCompressed section whose CHM header is searched
 * f_ptr - cache slot for the file entry; if it is already non-NULL the
 *         function returns MSPACK_ERR_OK without searching again
 * name  - name of the system file; the new entry's filename field points
 *         at this string directly (it is not copied)
 *
 * Returns MSPACK_ERR_OK, MSPACK_ERR_DATAFORMAT (search failed or file not
 * present) or MSPACK_ERR_NOMEMORY. self->error is not assigned here.
 */
static int find_sys_file(struct mschm_decompressor_p *self,
                         struct mschmd_sec_mscompressed *sec,
                         struct mschmd_file **f_ptr, const char *name)
{
  struct mspack_system *system = self->system;
  struct mschmd_header *chm;
  struct mschmd_file found, *entry;

  /* nothing to do if an earlier call already resolved this file */
  if (*f_ptr != NULL) {
    return MSPACK_ERR_OK;
  }

  chm = sec->base.chm;

  /* look the name up; a failed search and a clean "no such file" result
   * (section left NULL) are both reported as a format error */
  if (chmd_fast_find((struct mschm_decompressor *) self, chm, name,
                     &found, (int) sizeof(found)) != 0)
  {
    return MSPACK_ERR_DATAFORMAT;
  }
  if (found.section == NULL) {
    return MSPACK_ERR_DATAFORMAT;
  }

  /* keep a heap copy of the search result */
  entry = (struct mschmd_file *) system->alloc(system, sizeof(found));
  if (entry == NULL) {
    return MSPACK_ERR_NOMEMORY;
  }
  *entry = found;
  entry->filename = (char *) name;

  /* push the copy onto the front of the CHM's sysfiles list */
  entry->next = chm->sysfiles;
  chm->sysfiles = entry;

  *f_ptr = entry;
  return MSPACK_ERR_OK;
}

/***************************************
 * READ_SYS_FILE
 ***************************************
 * Allocates memory for a section 0 (uncompressed) file and reads it into
 * memory.
*/ static unsigned char *read_sys_file(struct mschm_decompressor_p *self, struct mschmd_file *file) { struct mspack_system *sys = self->system; unsigned char *data = NULL; int len; if (!file || !file->section || (file->section->id != 0)) { self->error = MSPACK_ERR_DATAFORMAT; return NULL; } len = (int) file->length; if (!(data = (unsigned char *) sys->alloc(sys, (size_t) len))) { self->error = MSPACK_ERR_NOMEMORY; return NULL; } if (sys->seek(self->d->infh, file->section->chm->sec0.offset + file->offset, MSPACK_SYS_SEEK_START)) { self->error = MSPACK_ERR_SEEK; sys->free(data); return NULL; } if (sys->read(self->d->infh, data, len) != len) { self->error = MSPACK_ERR_READ; sys->free(data); return NULL; } return data; } /*************************************** * CHMD_ERROR *************************************** * returns the last error that occurred */ static int chmd_error(struct mschm_decompressor *base) { struct mschm_decompressor_p *self = (struct mschm_decompressor_p *) base; return (self) ? self->error : MSPACK_ERR_ARGS; } /*************************************** * READ_OFF64 *************************************** * Reads a 64-bit signed integer from memory in Intel byte order. * If running on a system with a 64-bit off_t, this is simply done. * If running on a system with a 32-bit off_t, offsets up to 0x7FFFFFFF * are accepted, offsets beyond that cause an error message. 
*/
/* var - receives the decoded value
 * mem - at least 8 readable bytes holding the little-endian value
 * sys - system interface, used only to emit the "large files" message
 * fh  - file handle passed to sys->message()
 *
 * Returns 0 on success, 1 if the value cannot be represented (only
 * possible when off_t is narrower than 64 bits).
 */
static int read_off64(off_t *var, unsigned char *mem,
                      struct mspack_system *sys, struct mspack_file *fh)
{
#if SIZEOF_OFF_T >= 8
  *var = EndGetI64(mem);
#else
  /* any of bits 31..63 set means the value needs more than 31 bits */
  if ((mem[3] & 0x80) | mem[4] | mem[5] | mem[6] | mem[7]) {
    sys->message(fh, "library not compiled to support large files.");
    return 1;
  }
  *var = EndGetI32(mem);
#endif
  return 0;
}

/* ENCINT_MAX_BYTES:  longest ENCINT accepted (7 value bits per byte).
 * ENCINT_MAX_PREFIX: largest partial value that can still be shifted left
 *                    by 7 bits without exceeding the positive range of off_t.
 */
#if SIZEOF_OFF_T >= 8
/* 63 bits allowed: 9 * 7 bits/byte */
#  define ENCINT_MAX_BYTES 9
#  define ENCINT_MAX_PREFIX ((((off_t) 1) << 56) - 1)
#else
/* 31 bits allowed: up to 5 bytes, but a 5-byte ENCINT may only carry
 * 3 value bits in its first (most significant) byte */
#  define ENCINT_MAX_BYTES 5
#  define ENCINT_MAX_PREFIX ((((off_t) 1) << 24) - 1)
#endif

/***************************************
 * READ_ENCINT
 ***************************************
 * Reads an ENCINT from memory. If running on a system with a 32-bit off_t,
 * ENCINTs up to 0x7FFFFFFF are accepted, values beyond that are an error.
 *
 * An ENCINT is a sequence of bytes, most significant first, each carrying
 * 7 value bits; bit 0x80 is set on every byte except the final one.
 *
 * p   - in/out cursor; advanced past every byte consumed
 * end - one past the last readable byte
 * err - set to 1 on failure; left untouched on success
 *
 * Returns the decoded value, or 0 with *err = 1 if the data runs past
 * `end`, the value does not fit in the positive range of off_t, or the
 * ENCINT is still unterminated after ENCINT_MAX_BYTES bytes.
 */
static off_t read_encint(const unsigned char **p, const unsigned char *end,
                         int *err)
{
  off_t result = 0;
  unsigned char c;
  int i;

  for (i = 0; i < ENCINT_MAX_BYTES; i++) {
    /* ran off the end of the buffer */
    if (*p >= end) break;
    c = *(*p)++;

    /* check before shifting: once the partial value exceeds the prefix
     * limit, another 7 bits would overflow off_t */
    if (result > ENCINT_MAX_PREFIX) break;
    result = (result << 7) | (c & 0x7F);

    /* a clear top bit marks the final byte */
    if (!(c & 0x80)) return result;
  }

  /* truncated input, overflow, or too many continuation bytes */
  *err = 1;
  return 0;
}