Revision 237 (by dpavlin, 2004/03/08 17:43:12) initial import of openisis 0.9.0 vendor drop
#!/bin/bash
#/*
#	openisis - an open implementation of the CDS/ISIS database
#	Version 0.8.x (patchlevel see file Version)
#	Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
#
#	This library is free software; you can redistribute it and/or
#	modify it under the terms of the GNU Lesser General Public
#	License as published by the Free Software Foundation; either
#	version 2.1 of the License, or (at your option) any later version.
#
#	This library is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#	Lesser General Public License for more details.
#
#	You should have received a copy of the GNU Lesser General Public
#	License along with this library; if not, write to the Free Software
#	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#	see README for more information
#EOH */
#
#
CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
#
#	utility to convert several document types to text
#	used for mail archiving
#	similar to anytopnm
#

# assumptions:
#	filename names a file in the current directory
#	we may create or overwrite any filename.* next to it
#

#	return code:
# the lowest valued bit denotes that there is a plain version in filename.txt
#
#	0 file was fine text and is unmodified
#	1	file was sort of (non-iso,long-lined) text, plain version is filename.txt
#	2	file was formatted html text
#	3	file was formatted html text, plain version is filename.txt
#	4,5 likewise for pdf
#	6,7 likewise for rtf
#	8,9 likewise for doc (M$-w0rd)
#	16,17	file was mail text
#	32	gif image
#	34	jpg image
#	36	bmp image
#	38	png image
#	40	tiff image
#	46	other image
#	??	other
#	124	unknown
#	126	error
#

# verbose is executed as a command further down ($verbose && ...), so it
# has to hold the name of the shell builtin "true" or "false";
# the -v option switches it to true
verbose=false

# err MESSAGE...
#	print all arguments as one line on stderr and terminate the script
#	with status 126, the "error" code of the return code table above
#	printf is used instead of echo so that a message which happens to
#	look like an echo option (-n, -e) is printed instead of swallowed
err () {
	printf '%s\n' "$*" >&2
	exit 126
}

# dewindoofy
#	filter stdin to stdout, replacing every byte in the range \200-\237
#	(presumably the Windows-1252 specials in what is otherwise
#	ISO-8859-1 text) by a printable stand-in; other bytes pass unchanged
#
#	replacements, one per input byte, in order:
#	\200-\217	E ? , f " . + # ^ % S < \326 ? Z ?
#	\220-\237	? ' ' " " * - - " T s > \366 ? z Y
#
#	\326 and \366 are the ISO-8859-1 codes of O and o with diaeresis;
#	they are written as octal escapes so that the mapping does not depend
#	on the encoding this script file is stored in
#	both '-' are written as \055: a bare "*--" in the middle of a tr set
#	is read as the range from '*' to '-' (four characters), which shifts
#	every later replacement by two positions
#	LC_ALL=C makes tr work on single bytes whatever the caller's locale
dewindoofy () {
	LC_ALL=C tr '\200-\237' 'E?,f".+#^%S<\326?Z??'\'\''""*\055\055"Ts>\366?zY'
}

# trace every invocation (arguments and working directory) on stderr
echo "any2txt $* ($PWD)" 1>&2

# everything in front of the last argument is treated as an option;
# the last argument itself stays in $1 for the code that follows
#	-v		switch verbose on
#	-m VALUE	store VALUE in mtype (uses up the following argument)
# any other word is dropped without complaint
until (( $# <= 1 )); do
	if [[ "$1" == -v ]]; then
		verbose=true
	elif [[ "$1" == -m ]]; then
		mtype="$2"
		shift
	fi
	shift
done

# no output is expected on stdout (converted text goes to "$filename".txt),
# so anything that is printed from here on counts as diagnostics and is
# sent to stderr; this is done unconditionally, also in verbose mode
exec >&2

# the one remaining argument names the file to examine; it has to be a
# readable regular file, otherwise give up with the error code
# [[ ... && ... ]] is used because the obsolescent "[ ... -a ... ]" is
# mis-parsed when the name looks like a test operator (e.g. "=" or "!")
filename="$1"
[[ -f "$filename" && -r "$filename" ]] || err "'$filename' not readable"

# convert_by_type FTYPE
#	FTYPE is a type description as printed by "file -b"
#	depending on FTYPE a plain text rendition of the global $filename is
#	written to "$filename".txt, and the global $code is set to the return
#	code from the table at the top of this file (124 if nothing matches)
#	for mail, html, pdf, rtf and doc the odd code ("plain version exists")
#	is only set once the converter reported success
#	the order of the patterns matters: HTML has to be checked before the
#	ASCII/ISO-8859 patterns, "long lines" before the general text patterns
convert_by_type () {
	code=124
	case "$1" in
	*"mail text"*)
		# the plain version is the body: drop all lines up to and
		# including the first empty one
		code=16
		sed -e '1,/^$/d' <"$filename" >"$filename".txt && code=17
		;;
	*HTML*) # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
		code=2
		lynx -dump -force_html -nolog "$filename" >"$filename".txt && code=3
		;;
	*"Non-ISO extended-ASCII"*"long lines") # assume windows stuff
		# code 1 is set whatever the exit status of the filters is
		dewindoofy <"$filename" | fmt -s >"$filename".txt
		code=1
		;;
	*"Non-ISO extended-ASCII"*) # assume windows stuff
		dewindoofy <"$filename" >"$filename".txt
		code=1
		;;
	ASCII*text*"long lines"|ISO-8859*text*"long lines")
		fmt -s <"$filename" >"$filename".txt
		code=1
		;;
	ASCII*text*|ISO-8859*text*)
		# already fine; no .txt file is written
		code=0
		;;
	PDF*)
		code=4
		pdftotext -raw "$filename" - >"$filename".txt && code=5
		;;
	"Rich Text Format"*)
		code=6
		# rtf is turned into html by unrtf and then dumped by lynx, which
		# is given its own stdin under the name /proc/self/fd/0
		# pipefail (confined to the subshell) lets a failing unrtf count
		# as failure too, instead of only looking at the status of lynx
		(
			set -o pipefail
			unrtf --nopict --html "$filename" |
				lynx -dump -force_html -nolist /proc/self/fd/0
		) >"$filename".txt && code=7
		;;
	"Microsoft Office Document"*)
		code=8
		antiword "$filename" >"$filename".txt && code=9
		;;
	*"GIF image"*) code=32;;
	*"JPEG image"*) code=34;;
	*"PC bitmap"*) code=36;;
	*"PNG image"*) code=38;;
	*"TIFF image"*) code=40;;
	*image*) code=46;;
	esac
}

# "--" keeps a name starting with '-' from being taken as an option of file
ftype="$(file -b -- "$filename")"

convert_by_type "$ftype"

# in verbose mode report file name, detected type and return code on stderr
if $verbose; then
	echo "$filename: $ftype: $code" 1>&2
fi

# the classification is handed to the caller as exit status
exit $code