/* Revision 238 (by dpavlin, 2004/03/08 17:46:16) tagging openisis 0.9.0 */
/*
	openisis - an open implementation of the CDS/ISIS database
	Version 0.8.x (patchlevel see file Version)
	Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	see README for more information
EOH */
#ifndef LDB_H

/*
	$Id: ldb.h,v 1.19 2003/04/08 00:20:52 kripke Exp $
	package interface of the general db access functions.
*/

#include "luti.h"
#include "lll.h"

/* #define LDB_MAGIC  0x0FE91515 read OPENISIS */ 


/*
	MFR members.
	Two groups of constants share this enum: the leading part, numbered
	from 1, and the repeated part, numbered from 0 again.
	NOTE(review): presumably these are int indices into the raw record
	returned by ldb_readRec, with one repeated part per field -- confirm
	against the implementation.
*/
enum {
	/* leading part */
	LMFR_MFN  = 1, /* the rowid */
	LMFR_RECL = 2, /* total external length (even), negative for locked record */
	LMFR_BWB  = 3, /* block of rec's previous version */
	LMFR_BWP  = 4, /* pos of rec's previous version */
	LMFR_BASE = 5, /* offset of contents area */
	LMFR_NVF  = 6, /* number of fields */
	LMFR_STAT = 7, /* state, if != 0, about to be deleted */
	LMFR__FL  = 8, /* offset of repeated part */

	/* repeated part, numbered relative to its own start */
	LMFR_TAG  = 0,
	LMFR_POS  = 1,
	LMFR_LEN  = 2,
	LMFR__RL  = 3  /* length of repeated part */
};


/**
	read a raw isis record by rowid.
	the returned memory must be freed after usage.
	NOTE(review): the result is presumably an int array laid out according
	to the LMFR_* constants, and presumably NULL when the record cannot be
	read -- confirm against the implementation.
*/
extern int *ldb_readRec ( int db, int rowid );


/**
	numerical type for a records file position.
	To support large DBs, this may be lll.
*/
typedef int lxref;


/**
	read a raw isis record by xref.
	the returned memory must be freed after usage.

	nxtoff is declared as lxref* (the same type as int* while lxref is int),
	so that both file positions of this interface change together
	should lxref ever be widened.
	NOTE(review): nxtoff presumably receives the position of the record
	following the one read -- confirm against the implementation.
*/
extern int *ldb_readRecAtOff ( int dbid, lxref off, lxref *nxtoff );




/*
**	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
**	detailed search structures and functions
**
**	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/

/* size limits of the search structures */
enum {
	LDB_INDEXES    = 2,
	LDB_TERMBUF    = 128, /* ints */
	LDB_MAX_KEYLEN = 31   /* actually, it's 30 for isis-1 */
};


/**
	the isis-1 posting is a 64bit number.
	In the file it's in big endian order, to allow ordering using memcmp.
	In the LdbP we have native order to use native 64bit.
	Until we need more bits for postings, we may use the 64bit type
	provided by gcc and some other compilers.

	the 8 bytes are: highest 3 for the rowid, then 2 tag, then 3 pos.
	the 3 pos bytes in turn are 1 byte for the occurrence
	and 2 bytes for the word count.
	(e.g. value 2<<16 | 7 if term was found in the 7th word of
	the 2nd occurrence of field 'tag').
	we usually don't care, since for a near condition we always
	want a small distance w/in the same occurrence which may be
	conveniently checked by the difference of 3 byte pos values.

	BUT NOTE: ... unfortunately we DO NEED one more bit for marking
	postings during AND operations. we abuse the highest bit of the
	word counter, since it's for sure useless for any reasonable
	distance check.
	*** BE CAREFUL WHEN COMPARING POSTINGS DURING "AND" OP ***
	use the LDBP_xxM macros below.
*/
typedef union {
	char bytes[8]; /* the posting as 8 raw bytes */
	lll  val; /* the posting as one 64bit number, as used by the LDBP_ macros */
} LdbP;



/*
	all macros operate on pointers (to LdbP).

	bit layout of val, high to low, as used by the masks below:
	  bits 63..40  rowid
	  bits 39..24  tag
	  bits 23..16  occurrence   \  together: pos
	  bit  15      mark          > (mark is the abused highest bit
	  bits 14..0   word count   /   of the word counter)
*/
/*
	access parts.
	The 64bit value is shifted and masked FIRST and narrowed afterwards, so
	the narrowing conversion only ever sees a small non-negative value.
	(Narrowing the unmasked 64bit value to int would be an out-of-range
	signed conversion, whose result is implementation-defined.)
	Result types are unchanged: long for ROW/TAG/POS, int for MARK.
*/
#define LDBP_ROW( p )  ((long)(((p)->val >> 40) & 0xffffffL))
#define LDBP_TAG( p )  ((long)(((p)->val >> 24) &   0xffffL))
#define LDBP_POS( p )  ((long)( (p)->val        & 0xff7fffL))
#define LDBP_MARK( p ) ((int) ( (p)->val        &   0x8000 ))

/* set / clear the mark bit in place */
#define LDBP_SETMARK( p ) ((p)->val |= LLL(            0x8000))
#define LDBP_CLRMARK( p ) ((p)->val &= LLL(0xffffffffffff7fff))
/* value w/o the mark bit */
#define LDBP_IGNMARK( p ) ((p)->val & LLL(0xffffffffffff7fff))

/* compare according to rowid, tag, pos (and mark, if set!) */
#define LDBP_EQ( a, b ) ((a)->val == (b)->val)
#define LDBP_GT( a, b ) ((a)->val > (b)->val)
/* compare IGNORING MARK according to rowid, tag, pos */
#define LDBP_EQM( a, b ) (LDBP_IGNMARK(a) == LDBP_IGNMARK(b))
#define LDBP_GTM( a, b ) (LDBP_IGNMARK(a) > LDBP_IGNMARK(b))

/*
	manipulation: set bottom / top for row of given posting.
	d gets the rowid of s with all lower 40 bits cleared (BOT) or set (TOP).
*/
#define LDBP_SETROWBOT( d, s ) ((d)->val = (s)->val & LLL(0xffffff0000000000))
#define LDBP_SETROWTOP( d, s ) ((d)->val = (s)->val | LLL(      0xffffffffff))


/**
	postings come in arrays with some header.
	The standard structure gives an 8k buffer,
	but member len may give other actual length.
	Members marked "in" are read as parameters, members marked "io"
	are also written back.
	NOTE(review): the 8k figure depends on OPENISIS_SETLEN, which is
	defined outside this file -- confirm.
	NOTE(review): "near" is an empty macro in some DOS/Windows headers;
	presumably not a problem for the supported compilers -- confirm.
*/
typedef struct {
	short mode; /* in: merge flags */
	short near; /* in: near distance; in OR mode: collect pos info */
	int   tag; /* in: tag, to which postings are restricted */
	int   skp; /* in: ignore mfns < skp */
	int   len; /* in: length (# of postings) of buffer (if 0 : default length) */
	int   fil; /* io: number of postings actually used */
	int   cut; /* io: min mfn ignored due to buffer length */
	LdbP p[OPENISIS_SETLEN];
	/**
		each element of p is
		8 bytes as in IFP file: mfn[3],tag[2],occ[1],cnt[2]
		highest bit of cnt (1LL<<15) is used as mark
	*/
} LdbPost;

/*
	merge operations and flags.
	OR/AND/NOT are plain numbers in the lowest two bits,
	PFX and KEEPMARKS are single bits above them.
	NOTE(review): presumably the values of LdbPost.mode -- confirm.
*/
enum {
	LDB_OR        = 0,
	LDB_AND       = 1,
	LDB_NOT       = 2,      /* like AND, but keep unmarked postings */
	LDB_PFX       = 1 << 2, /* prefix match */
	LDB_KEEPMARKS = 1 << 3  /* do not compact after AND/NOT */
};

/*
	special near distances, the largest and smallest 16bit signed values.
	NOTE(review): presumably passed in LdbPost.near -- confirm.
*/
enum {
	LDB_NEAR_F = 0x7fff,          /* the (F): same occurrence of field */
	LDB_NEAR_G = -LDB_NEAR_F - 1  /* the (G): same field */
};

/**
	search db for key.
	post carries the search parameters and the postings buffer,
	see the "in" and "io" remarks on the members of LdbPost.
	NOTE(review): the role of rec and the meaning of the return value
	are not visible from this header -- confirm against the implementation.
*/
extern int ldb_search ( int db, const char *key, LdbPost *post,
	OpenIsisRec *rec );


/*
	NOTE(review): judging by name and signature, presumably converts the
	postings in post to the set -- confirm against the implementation,
	including the meaning of the return value.
*/
extern int ldb_p2s ( OpenIsisSet *set, LdbPost *post );

/*
	get the Db for a db id.
	NOTE(review): the result for an unknown or closed dbid is not visible
	from this header (presumably NULL) -- confirm against the implementation.
*/
extern Db* ldb_getdb (int dbid);

#define LDB_H
#endif /* LDB_H */