/* Revision 238 (by dpavlin, 2004/03/08 17:46:16) tagging openisis 0.9.0 */
/*
	openisis - an open implementation of the CDS/ISIS database
	Version 0.8.x (patchlevel see file Version)
	Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	see README for more information
EOH */
#ifndef LCS_H

/*
	$Id: lcs.h,v 1.3 2003/04/08 00:20:52 kripke Exp $
	charset support
*/

/* the kind of encoding a charset uses */
typedef enum {
	/* some ASCII-compliant single byte charset */
	LCS_SINGLE = 0,
	/* unicode in 8bit transfer encoding */
	LCS_UTF8,
	/* the unicode base multilingual plane (BMP, first 64K chars)
		in 2byte encoding, native (platform) byte order */
	LCS_UCS2
}	lcs_type;

/* the single byte charsets known by id */
typedef enum {
	/* the ISO-8859-1 (Latin 1) charset */
	LCS_LATIN1 = 0,
	/* MS-DOS Latin 1, known as the "OEM" charset */
	LCS_CP850,
	/* # of single byte charsets; must stay the last enumerator */
	LCS__SBCS
}	lcs_single;


/* ids of the tables kept for a charset */
typedef enum {
	/* flags see below */
	LCS_CTYPE = 0,
	/* 128 unicodes */
	LCS_UNICO,
	/* each character's uppercase code */
	LCS_UCASE,
	/* single byte recoding for index ("strxfrm") */
	LCS_INDEX,
	/* inverse of LCS_INDEX */
	LCS_XEDNI,
	/* single byte recoding to extern (typically Latin1) */
	LCS_RECOD,
	/* inverse of LCS_RECOD */
	LCS_DOCER,
	/* # of table ids; must stay the last enumerator */
	LCS__TABS
} lcs_tabid;


/*
	a conversion table on a single byte charset, in one of two views:

	c - one byte per character code (256 entries). may hold
		an actab (!0 for any character deemed "alpha"),
		an uctab ("uppercase" replacement code, typically removing
		diacriticals), a code-to-code conversion or other.
	u - one 16 bit value per character of the upper half (128 entries).
		may hold the unicode values for the upper half of an
		ASCII compatible single byte charset.

	both views share the same storage, so a table is either a c table
	or an u table, never both at once.
*/
typedef union {
	unsigned char c[256];	/* byte per character code */
	unsigned short u[128];	/* unicode per upper half character */
}	LcsTab;

/*
	character class codes (the "flags" of the LCS_CTYPE table).
	the values are laid out so that the LCS_IS* macros can classify
	a code by looking at a few bits only:
		0x80 clear          separator, 0x80 set: "word" character
		0x80 and 0x40 clear control
		0x80 and 0x40 set   identifier character
		0x80,0x40,0x20 set  alpha
		0xe0 all clear      field or record separator
		0x08 set            one of the other separators (C, S, T)
*/
enum {
	/* --- controls (0..31,127) --- */
	/* record separator (FF,GS) */
	LCS_R = 0x00,
	/* field  separator (NUL,CR,LF,RS) */
	LCS_F = 0x10,
	/* other control */
	LCS_C = 0x28,

	/* --- other separators --- */
	/* space(blank,nbsp) */
	LCS_S = 0x48,
	/* other token separator (,:;=) */
	LCS_T = 0x68,

	/* --- other punctuation assumed "word" characters --- */
	LCS_P = 0x80,
	/* symbol */
	LCS_Y = 0xb0,

	/* --- identifiers --- */
	/* C-ident (_) */
	LCS_I = 0xc0,
	/* digits ('0'..'9') */
	LCS_D = 0xd0,
	/* ASCII alpha */
	LCS_A = 0xe0,
	/* ... among those formatting literals a,c,i,x */
	LCS_L = 0xe4,
	/* other alpha ("national"/non-ASCII) */
	LCS_N = 0xf0
};

/*
	classification of a character class code t (one of the LCS_x values).
	every macro evaluates t exactly once.
	ISCONTR, ISSPACE, ISIDENT and ISALPHA yield 0 or 1,
	ISPRINT yields 0 or the (non-zero) masked bits.
*/
/* control: neither of the two high bits is set */
#define LCS_ISCONTR( t ) (0 == ((t) & 0xc0))
/* space: anything up to LCS_S, so controls count as space, too */
#define LCS_ISSPACE( t ) ((t) <= LCS_S)
/* printable: at least one of the two high bits is set (LCS_S <= (t)) */
#define LCS_ISPRINT( t ) ((t) & 0xc0)
/* identifier character: both high bits are set */
#define LCS_ISIDENT( t ) (((t) & 0xc0) == 0xc0)
/* alpha: the three high bits are set */
#define LCS_ISALPHA( t ) (((t) & 0xe0) == 0xe0)

/*
	for record parsing.
	ISSEP and ISFR yield 0 or 1, ISWORD and ISCST yield 0 or the tested bit.
*/
/* any separator: the high bit is clear */
#define LCS_ISSEP( t )   (0 == ((t) & 0x80))
/* "word" character: the high bit is set */
#define LCS_ISWORD( t )  ((t) & 0x80)
/* field or record separator: none of the three high bits is set */
#define LCS_ISFR( t )    (0 == ((t) & 0xe0))
/* other separator: the 0x08 bit is set */
#define LCS_ISCST( t )   ((t) & 0x08)


/* special "bits" value: have mktab create a unicode table */
enum { LCS_MKUNI = -1 };

/**
	fill the table dst from a string holding a free style sequence
	of decimal numbers; any non-digit is ignored.
	bits selects what is done with the numbers:
	- bits == 0: the numbers are assigned sequentially to dst->c
	- bits <  0: the numbers are assigned to dst->u
	- otherwise: for every number, bits are set in the corresponding
	  array element
	- anything else: something strange may happen.
	NOTE(review): len is presumably the length of numbers, and the
	meaning of the return value is not visible from this header
	-- confirm against the implementation.
*/
int lcs_mktab ( LcsTab *dst, char *numbers, int len, int bits );

/**
	build the single byte conversion table dst out of the
	two unicode tables from and to.
	to may be NULL, in which case the trivial (Latin1) table is used.
	NOTE(review): the table sizes (presumably 256 bytes for dst and
	128 unicodes for from/to) and the meaning of the return value
	are not visible from this header -- confirm against the implementation.
*/
int lcs_mkrecod (
	unsigned char *dst,
	unsigned short *from,
	unsigned short *to );

/*
	predefined 256 entry byte tables, defined outside of this header.
	NOTE(review): judging by the names, presumably the uppercase (uc)
	and the character class (ct) table for Latin 1 -- confirm.
*/
extern unsigned char
	lcs_latin1_uc[256],
	lcs_latin1_ct[256];

#define LCS_H
#endif /* LCS_H */