| Revision 237 (by dpavlin, 2004/03/08 17:43:12) |
initial import of openisis 0.9.0 vendor drop
|
#!/bin/bash
#/*
# openisis - an open implementation of the CDS/ISIS database
# Version 0.8.x (patchlevel see file Version)
# Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# see README for more information
#EOH */
#
#
CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
#
# utility to convert several document types to plain text
# used for mail archiving
# similar to anytopnm
#
# usage: any2txt [-v] [-m mtype] filename
#
# assumptions:
# filename is a name of the file in the current directory
# we are allowed to use any filename.*
#
# return code:
# the least significant bit is set when a plain version was written to filename.txt
#
# 0 file was fine text and is unmodified
# 1 file was sort of (non-iso,long-lined) text, plain version is filename.txt
# 2 file was formatted html text
# 3 file was formatted html text, plain version is filename.txt
# 4,5 likewise for pdf
# 6,7 likewise for rtf
# 8,9 likewise for doc (Microsoft Word)
# 16,17 file was mail text
# 32 gif image
# 34 jpg image
# 36 bmp image
# 38 png image
# 40 tiff image
# 46 other image
# ?? other
# 124 unknown
# 126 error
#
# Holds the command name "true" or "false": the value is executed directly
# as a condition, so it must stay one of those two words. -v sets it to true.
verbose=false
# Print all arguments, joined into one line, on stderr and terminate the
# script with status 126 (the "error" entry of the return code table).
err () {
  local message="$*"
  echo "$message" 1>&2
  exit 126
}
# Filter stdin to stdout, replacing each byte in the range 0x80-0x9F
# (presumably Windows code page punctuation such as curly quotes, dashes
# and bullets) with a single-byte stand-in. All other bytes pass through
# unchanged.
dewindoofy () {
  # The replacement table must expand to exactly 32 bytes, one per source
  # byte, so nothing in it may be open to reinterpretation:
  #  - the two dashes are written as \055 because a literal "*--" is read
  #    by tr as the range '*'..'-' (four characters), which shifts every
  #    mapping after it by one;
  #  - the two non-ASCII letters are written as \326 and \366 (O/o with
  #    diaeresis in ISO-8859-1) so the table does not grow when this
  #    script is stored in a multi-byte encoding.
  # LC_ALL=C makes tr operate on bytes regardless of the caller's locale.
  LC_ALL=C tr '\200-\237' 'E?,f".+#^%S<\326?Z??'\'\''""*\055\055"Ts>\366?zY'
}
# Log the invocation and the working directory on stderr.
echo "any2txt $* ($PWD)" 1>&2

# Every argument except the last one is treated as an option; the last one
# is left in $1 for the code that follows. Unrecognised options are dropped
# without complaint.
until [ "$#" -le 1 ]; do
  if [ "$1" = -v ]; then
    verbose=true
  elif [ "$1" = -m ]; then
    # -m carries a value: store it and consume that argument as well
    mtype="$2"
    shift
  fi
  shift
done

# No regular output is expected from here on, so anything written to stdout
# is sent to stderr. (A disabled variant of this code skipped the redirect
# when $verbose was set.)
exec 1>&2
# Classify the file named by the remaining argument with file(1) and, where
# a converter is known, write a plain-text rendering to "<filename>.txt".
# The exit status reports the detected type and, in its lowest bit, whether
# the .txt file was produced.
filename="$1"
[ -f "$filename" ] && [ -r "$filename" ] || err "'$filename' not readable"

# A name starting with '-' would be parsed as an option by the external
# tools, so they are handed an explicit ./ path. Redirections are not
# affected and keep using the name exactly as passed in.
case "$filename" in
  -*) src="./$filename" ;;
  *) src="$filename" ;;
esac

ftype="$(file -b "$src")"
code=124 # stays "unknown" unless one of the patterns below matches
case "$ftype" in
  *"mail text"*)
    # Delete line 1 through the first empty line, i.e. the header block.
    # Like the other converters, only report the odd code when the plain
    # version was actually written.
    code=16
    sed -e '1,/^$/d' <"$filename" >"$filename".txt && code=17
    ;;
  *HTML*) # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
    code=2
    lynx -dump -force_html -nolog "$src" >"$filename".txt && code=3
    ;;
  *"Non-ISO extended-ASCII"*"long lines") # assume windows stuff
    # map the 0x80-0x9F bytes, then break long lines without joining short ones
    dewindoofy <"$filename" | fmt -s >"$filename".txt
    code=1
    ;;
  *"Non-ISO extended-ASCII"*) # assume windows stuff
    dewindoofy <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text*"long lines" | ISO-8859*text*"long lines")
    fmt -s <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text* | ISO-8859*text*)
    # already usable as it is; no .txt copy is made
    code=0
    ;;
  PDF*)
    code=4
    pdftotext -raw "$src" - >"$filename".txt && code=5
    ;;
  "Rich Text Format"*)
    code=6
    # RTF goes through HTML: unrtf emits HTML, lynx reads it from stdin via
    # /proc/self/fd/0. Only lynx's exit status decides the code here.
    unrtf --nopict --html "$src" \
      | lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt && code=7
    ;;
  "Microsoft Office Document"*)
    code=8
    antiword "$src" >"$filename".txt && code=9
    ;;
  # images are only classified, never converted
  *"GIF image"*) code=32 ;;
  *"JPEG image"*) code=34 ;;
  *"PC bitmap"*) code=36 ;;
  *"PNG image"*) code=38 ;;
  *"TIFF image"*) code=40 ;;
  *image*) code=46 ;;
esac

# $verbose holds the command name true or false
$verbose && echo "$filename: $ftype: $code" >&2
exit "$code"