/[webpac]/trunk/openisis/any2txt
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/openisis/any2txt

Parent Directory Parent Directory | Revision Log Revision Log


Revision 239 - (show annotations)
Mon Mar 8 17:49:13 2004 UTC (20 years ago) by dpavlin
File size: 3425 byte(s)
including openisis 0.9.0 into webpac tree

1 #!/bin/bash
2 #/*
3 # openisis - an open implementation of the CDS/ISIS database
4 # Version 0.8.x (patchlevel see file Version)
5 # Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
6 #
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
11 #
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
16 #
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #
21 # see README for more information
22 #EOH */
23 #
24 #
25 CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
26 #
27 # utility to convert several document types to text
28 # used for mail archiving
29 # similar to anytopnm
30 #
31
32 # assumptions:
33 # filename is a name of the file in the current directory
34 # we are allowed to use any filename.*
35 #
36
37 # return code:
38 # the lowest valued bit denotes that there is a plain version in filename.txt
39 #
40 # 0 file was fine text and is unmodified
41 # 1 file was sort of (non-iso,long-lined) text, plain version is filename.txt
42 # 2 file was formatted html text
43 # 3 file was formatted html text, plain version is filename.txt
44 # 4,5 likewise for pdf
45 # 6,7 likewise for rtf
46 # 8,9 likewise for doc (M$-w0rd)
47 # 16,17 file was mail text
48 # 32 gif image
49 # 34 jpg image
50 # 36 bmp image
51 # 38 png image
52 # 40 tiff image
53 # 46 other image
54 # ?? other
55 # 124 unknown
56 # 126 error
57 #
58
# Default for the -v option; holds the command name "true" or "false" so it
# can be executed directly as a condition.
verbose=false

# err MESSAGE...
# Print the arguments, joined by spaces, on stderr and terminate the script
# with exit status 126 (the "error" return code of this tool).
err () {
  # printf instead of echo: a message such as "-n" or "-e ..." must be
  # printed literally, not interpreted as an echo option.
  printf '%s\n' "$*" >&2
  exit 126
}
65
# dewindoofy
# Filter stdin to stdout, replacing the 32 bytes 0x80-0x9F (octal 200-237)
# with one replacement byte each; all other bytes pass through unchanged.
# The replacement table, in order:
#
#   200-217:  E ? , f " . + # ^ % S < \326 ? Z ?
#   220-237:  ? ' ' " " * - - " T s > \366 ? z Y
#
# Notes on how the table is spelled:
#  - \326 and \366 are the single-byte ISO-8859-1 codes of O-umlaut and
#    o-umlaut. Literal non-ASCII characters would become two bytes each when
#    this script is stored as UTF-8 and shift the whole table.
#  - The two hyphens are written as \055 because a bare "*--" is read by tr
#    as the range '*' through '-' (four characters), which also shifts
#    every following replacement by one.
#  - LC_ALL=C forces byte-wise operation regardless of the caller's locale.
dewindoofy () {
  LC_ALL=C tr '\200-\237' \
    'E?,f".+#^%S<\326?Z??'\'\''""*\055\055"Ts>\366?zY'
}
69
# Log the invocation and the working directory on stderr.
echo "any2txt $* ($PWD)" >&2

# Consume leading arguments until exactly one is left; that last argument is
# taken as the filename further down.
#   -v        set verbose=true
#   -m VALUE  store VALUE in mtype (not read anywhere else in the visible
#             script)
# Any other leading argument is silently discarded.
while [ "$#" -gt 1 ]; do
  case "$1" in
    -v)
      verbose=true
      ;;
    -m)
      # -m needs its value AND a filename after it. Without this check
      # "-m file" would swallow the filename as the value and the script
      # would later fail with the misleading message "'' not readable".
      [ "$#" -ge 3 ] || err "option -m requires a value before the filename"
      mtype="$2"
      shift
      ;;
  esac
  shift
done

# No regular output is expected from this tool, so whatever the converters
# print on stdout is treated as diagnostics and sent to stderr.
# (This used to be conditional on $verbose; it is now unconditional.)
exec >&2
86
# The single remaining argument is the file to examine. It must be a
# readable regular file; otherwise err() terminates the script with 126.
filename="$1"
if [ ! -f "$filename" ] || [ ! -r "$filename" ]; then
  err "'$filename' not readable"
fi

# Brief type description as printed by file(1), e.g. "ASCII text".
ftype="$(file -b "$filename")"

# Choose the exit code from the description and, where a converter exists,
# write a plain-text rendering to "$filename".txt. For the converter-based
# types the even code is set first and bumped to the odd one only if the
# converter succeeded, so the lowest bit signals that the .txt file exists.
# Pattern order matters: the first matching pattern wins.
code=124   # unknown type unless a pattern below matches
case "$ftype" in
  *"mail text"*)
    # Keep the body only: delete everything up to the first empty line.
    code=16
    sed -e '1,/^$/d' <"$filename" >"$filename".txt && code=17
    ;;
  *HTML*) # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
    code=2
    lynx -dump -force_html -nolog "$filename" >"$filename".txt && code=3
    ;;
  # The "long lines" patterns end in * so that descriptions carrying a
  # suffix after that phrase (e.g. a CRLF note or a line-length count)
  # still get re-wrapped instead of falling through to the plain branches.
  *"Non-ISO extended-ASCII"*"long lines"*) # assume windows stuff
    dewindoofy <"$filename" | fmt -s >"$filename".txt
    code=1
    ;;
  *"Non-ISO extended-ASCII"*) # assume windows stuff
    dewindoofy <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text*"long lines"* | ISO-8859*text*"long lines"*)
    fmt -s <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text* | ISO-8859*text*)
    # Already fine text: nothing is written.
    code=0
    ;;
  PDF*)
    code=4
    pdftotext -raw "$filename" - >"$filename".txt && code=5
    ;;
  "Rich Text Format"*)
    code=6
    # RTF -> HTML -> text. lynx reads the HTML from its own stdin through
    # /proc/self/fd/0. Only the exit status of lynx (the last pipeline
    # stage) decides whether the code becomes 7.
    unrtf --nopict --html "$filename" \
      | lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt \
      && code=7
    ;;
  "Microsoft Office Document"*)
    code=8
    antiword "$filename" >"$filename".txt && code=9
    ;;
  # Images are only classified; no text version is produced.
  *"GIF image"*)  code=32 ;;
  *"JPEG image"*) code=34 ;;
  *"PC bitmap"*)  code=36 ;;
  *"PNG image"*)  code=38 ;;
  *"TIFF image"*) code=40 ;;
  *image*)        code=46 ;;
esac

# $verbose holds the command name "true" or "false".
$verbose && echo "$filename: $ftype: $code" >&2

exit "$code"

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26