| 1 |
237 |
dpavlin |
#!/bin/bash |
| 2 |
|
|
#/* |
| 3 |
|
|
# openisis - an open implementation of the CDS/ISIS database |
| 4 |
|
|
# Version 0.8.x (patchlevel see file Version) |
| 5 |
|
|
# Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org |
| 6 |
|
|
# |
| 7 |
|
|
# This library is free software; you can redistribute it and/or |
| 8 |
|
|
# modify it under the terms of the GNU Lesser General Public |
| 9 |
|
|
# License as published by the Free Software Foundation; either |
| 10 |
|
|
# version 2.1 of the License, or (at your option) any later version. |
| 11 |
|
|
# |
| 12 |
|
|
# This library is distributed in the hope that it will be useful, |
| 13 |
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 |
|
|
# Lesser General Public License for more details. |
| 16 |
|
|
# |
| 17 |
|
|
# You should have received a copy of the GNU Lesser General Public |
| 18 |
|
|
# License along with this library; if not, write to the Free Software |
| 19 |
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 20 |
|
|
# |
| 21 |
|
|
# see README for more information |
| 22 |
|
|
#EOH */ |
| 23 |
|
|
# |
| 24 |
|
|
# |
| 25 |
|
|
CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $' |
| 26 |
|
|
# |
| 27 |
|
|
# utility to convert several document types to text |
| 28 |
|
|
# used for mail archiving |
| 29 |
|
|
# similar to anytopnm |
| 30 |
|
|
# |
| 31 |
|
|
|
| 32 |
|
|
# assumptions: |
| 33 |
|
|
# filename is a name of the file in the current directory |
| 34 |
|
|
# we are allowed to use any filename.* |
| 35 |
|
|
# |
| 36 |
|
|
|
| 37 |
|
|
# return code: |
| 38 |
|
|
# the lowest valued bit denotes that there is a plain version in filename.txt |
| 39 |
|
|
# |
| 40 |
|
|
# 0 file was fine text and is unmodified |
| 41 |
|
|
# 1 file was sort of (non-iso,long-lined) text, plain version is filename.txt |
| 42 |
|
|
# 2 file was formatted html text |
| 43 |
|
|
# 3 file was formatted html text, plain version is filename.txt |
| 44 |
|
|
# 4,5 likewise for pdf |
| 45 |
|
|
# 6,7 likewise for rtf |
| 46 |
|
|
# 8,9 likewise for doc (M$-w0rd) |
| 47 |
|
|
# 16,17 file was mail text |
| 48 |
|
|
# 32 gif image |
| 49 |
|
|
# 34 jpg image |
| 50 |
|
|
# 36 bmp image |
| 51 |
|
|
# 38 png image |
| 52 |
|
|
# 40 tiff image |
| 53 |
|
|
# 46 other image |
| 54 |
|
|
# ?? other |
| 55 |
|
|
# 124 unknown |
| 56 |
|
|
# 126 error |
| 57 |
|
|
# |
| 58 |
|
|
|
| 59 |
|
|
# Default for the -v option; holds the name of a command (true/false) so it
# can be used directly as a condition.
verbose=false

# err MESSAGE...
# Write the arguments (joined into one line) to stderr and terminate the
# whole script with status 126, the documented "error" return code.
err () {
	# printf instead of echo: echo would swallow a message such as "-n"
	# or "-e ..." as an option instead of printing it.
	printf '%s\n' "$*" >&2
	exit 126
}
| 65 |
|
|
|
| 66 |
|
|
# dewindoofy
# Filter stdin to stdout, replacing each byte in the range \200-\237 with a
# single-byte stand-in; all other bytes pass through unchanged.
#
# Replacement table, in order from \200 to \237 (32 entries):
#   E ? , f " . + # ^ % S < \326 ? Z ? ?  ' ' " " * - - " T s > \366 ? z Y
# where \326 and \366 are the ISO-8859-1 codes for O-umlaut and o-umlaut.
dewindoofy () {
	# LC_ALL=C keeps tr strictly byte-oriented regardless of the caller's locale.
	# The two dashes are written as \055 because a bare "*--" is parsed by tr
	# as the range '*'..'-', which would shift every later replacement by one.
	# The umlauts are written as octal escapes so the table stays 32 single
	# bytes even if this script is stored as UTF-8.
	LC_ALL=C tr '\200-\237' 'E?,f".+#^%S<\326?Z??'\'\''""*\055\055"Ts>\366?zY'
}
| 69 |
|
|
|
| 70 |
|
|
# Trace the invocation (arguments and working directory) on stderr.
echo "any2txt $* ($PWD)" >&2

# Consume leading options. The loop stops as soon as only one argument is
# left, so the final argument is never interpreted as an option.
#   -v        turn on the verbose report
#   -m VALUE  remember VALUE in mtype (not read again in the code visible here)
# Anything else in front of the last argument is silently discarded.
until [[ $# -le 1 ]]; do
	if [[ "$1" == "-v" ]]; then
		verbose=true
	elif [[ "$1" == "-m" ]]; then
		mtype="$2"
		shift
	fi
	shift
done
| 79 |
|
|
|
| 80 |
|
|
# A verbose-dependent variant of the redirect below is kept disabled:
# if $verbose; then
#	:
# else
#	no output expected; so any output is error
exec >&2
# fi

# The one argument left after option parsing is the file to examine.
filename="$1"
[[ -f "$filename" && -r "$filename" ]] || err "'$filename' not readable"

# Classify by the description printed by file(1); -b omits the file name.
ftype="$(file -b "$filename")"

# Result code; stays at 124 ("unknown") when no pattern below matches.
# Arms are tried in order, so the more specific descriptions come first.
# Where a converter is followed by "&& code=N", the odd code (plain version
# available in "$filename".txt) is only set if the converter succeeded.
code=124
case "$ftype" in
	*"mail text"*)
		# drop everything up to and including the first empty line
		sed -e '1,/^$/d' <"$filename" >"$filename".txt
		code=17
		;;
	*HTML*)	# maybe ASCII or ISO-8859 HTML document text, so check before ASCII
		code=2
		lynx -dump -force_html -nolog "$filename" >"$filename".txt && code=3
		;;
	# The "long lines" arms end in '*' because file(1) may append further
	# descriptors after that phrase (e.g. ", with CRLF line terminators");
	# without it such files would fall through to the plain arms below.
	*"Non-ISO extended-ASCII"*"long lines"*)	# assume windows stuff
		dewindoofy <"$filename" | fmt -s >"$filename".txt
		code=1
		;;
	*"Non-ISO extended-ASCII"*)	# assume windows stuff
		dewindoofy <"$filename" >"$filename".txt
		code=1
		;;
	ASCII*text*"long lines"*|ISO-8859*text*"long lines"*)
		fmt -s <"$filename" >"$filename".txt
		code=1
		;;
	ASCII*text*|ISO-8859*text*)
		# already fine text: nothing is written
		code=0
		;;
	PDF*)
		code=4
		pdftotext -raw "$filename" - >"$filename".txt && code=5
		;;
	"Rich Text Format"*)
		code=6
		# convert to HTML first, then let lynx render the HTML read from stdin
		unrtf --nopict --html "$filename" \
			| lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt \
			&& code=7
		;;
	"Microsoft Office Document"*)
		code=8
		antiword "$filename" >"$filename".txt && code=9
		;;
	*"GIF image"*)	code=32;;
	*"JPEG image"*)	code=34;;
	*"PC bitmap"*)	code=36;;
	*"PNG image"*)	code=38;;
	*"TIFF image"*)	code=40;;
	*image*)	code=46;;
esac

# With -v, report what was detected and which code is being returned.
if $verbose; then
	echo "$filename: $ftype: $code" >&2
fi

exit "$code"