#!/bin/sh
#
# Audit the "dictionary" of PCP "words" used by man-spell
#
# Copyright (c) 2024 Ken McDonell, Inc.  All Rights Reserved.
#

usage="Usage: audit-man-dict [-d] [-i seed-file] [-[sS] save-file] [-D save-dict] [man-src-file ...]"

# Use POSIX collation in this script and every child process, so that
# the sort(1) and comm(1) steps later on agree about ordering no matter
# what locale the caller is using.
#
export LC_COLLATE=POSIX

# Required external commands ... probe with "command -v" (specified by
# POSIX) rather than which(1) which is not standardized and may not be
# installed.  Complaints go to stderr.
#
if ! command -v ispell >/dev/null 2>&1
then
    echo >&2 "Arrgh: ispell not installed, no dice"
    exit 1
fi

if ! command -v man-spell >/dev/null 2>&1
then
    echo >&2 "Arrgh: man-spell not on $PATH, no dice"
    exit 1
fi

# All temporary files are named $tmp.<suffix> and live in a private
# directory made by mktemp(1), so the names cannot be predicted (or
# pre-created) by another user of /tmp.
#
tmpdir=`mktemp -d /tmp/audit-man-spell.XXXXXX` || exit 1
tmp="$tmpdir/tmp"
status=0	# success is the default

# On exit (or HUP, INT, QUIT, TERM) remove the temporary directory and
# exit with whatever $status holds at that time ... code that wants a
# non-zero exit status sets status and then calls a bare "exit".
#
trap 'rm -rf "$tmpdir"; exit $status' 0 1 2 3 15

# file(1) does not work, so need some heuristics
# ... assume existence of $1 already established
#
# Usage: _istroff path
# Returns 0 if path looks like troff -man source, else returns 1
#
_istroff()
{
    # first line starts with #! => shell or other script
    if head -n 1 "$1" | grep -q '^#!'
    then
	return 1
    fi

    # first line contains "by Pod::Man" => generated by Pod::Man
    if head -n 1 "$1" | grep -q 'by Pod::Man'
    then
	return 1
    fi

    # a line starting with ".SH " anywhere in the file => -man macros
    if grep -q '^\.SH ' <"$1"
    then
	return 0
    fi

    return 1
}

# Command line option state, filled in by the getopts loop below
#
seed=''		# -i seed-file
save=''		# -s or -S save-file
debug=0		# bumped once for each -d
save_dict=''	# -D save-dict

# All the error paths set status and then use a bare "exit" ... the
# script's exit status is taken from $status, not from the exit argument.
# Error messages and the usage message all go to stderr.
#
while getopts "dD:i:s:S:?" c
do
    case "$c"
    in
	d)	debug=$((debug + 1))
		;;
	D)	# save dictionary strings
		save_dict="$OPTARG"
		;;
	i)	# seed "words" to start with, probably the result of a
		# prior run and saved with -s
		#
		if [ ! -f "$OPTARG" ]
		then
		    echo >&2 "Error: $OPTARG not found for -i"
		    status=1
		    exit
		fi
		seed="$OPTARG"
		;;
	s|S)	# save "words" == -i file if any, + new ones from
		# this run, all sorted and duplicates removed
		# -s save-file is precious
		# -S clobber an existing save-file
		#
		if [ "$c" = s ] && [ -f "$OPTARG" ]
		then
		    echo >&2 "Error: $OPTARG already exists for -s"
		    status=1
		    exit
		fi
		save="$OPTARG"
		;;
	\?)	# unknown option or missing option argument ... this is
		# an error, so make sure the exit status is not zero
		#
		echo >&2 "$usage"
		status=1
		exit
		;;
    esac
done
# discard the options, leaving just the man-src-file arguments (if any)
shift $((OPTIND - 1))

# This only behaves sanely when run from the top of the man pages
# tree, i.e. a directory whose path ends in /man ... anywhere else
# gets a warning on stderr, but the script carries on regardless
#
_cwd=`pwd`
if [ "${_cwd%/man}" = "$_cwd" ]
then
    # stripping a trailing /man changed nothing => not in .../man
    echo >&2 "Warning: pwd: $_cwd is not the top of the man dirs"
fi

# Pass 1
# - add "seed" words if any to "words" file
# - for each input file
#   + break into one "word" per line using a crude hack:
#     strip troff in-line font changes, then anything that is not
#     alphabetic and not _ and not ' => \n, then strip leading
#     or trailing ' (leaving behind embedded ones, so possessive
#     forms like pmcd's survive)
#   + append to "words" file
# - sort "words" file and remove duplicates
# - if -s or -S, save "words" file
#
if [ -n "$seed" ]
then
    cp "$seed" "$tmp.words"
else
    touch "$tmp.words"
fi

# The list of candidate files is passed down the pipe one name per
# line: the command line arguments if there are any, else every
# file or symlink below the current directory with a man section
# style suffix.
# Names are written with printf and read with "IFS= read -r" so that
# embedded blanks, glob characters and backslashes pass through intact.
#
if [ $# -gt 0 ]
then
    for arg
    do
	printf '%s\n' "$arg"
    done
else
    find * -name '*.[1-9]*' \( -type f -o -type l \)
fi \
| while IFS= read -r file
do
    if [ ! -f "$file" ]
    then
	echo "$file: not found"
	continue
    fi
    if _istroff "$file"
    then
	# smells like troff man source ...
	#
	[ "$debug" -gt 0 ] && echo "$file:"
	# \fX and \f(XX are the troff in-line font changes
	sed <"$file" \
	    -e 's/\\f[A-Z]//g' \
	    -e 's/\\f(..//g' \
	| tr -c "A-Za-z_'" '\012' \
	| sed >>"$tmp.words" \
	    -e "s/^''*//" \
	    -e "s/''*\$//"
    else
	[ "$debug" -gt 0 ] && echo >&2 "$file: skipped (not troff)"
    fi
done

sort -u <"$tmp.words" >"$tmp.tmp"
mv "$tmp.tmp" "$tmp.words"
[ -n "$save" ] && cp "$tmp.words" "$save"

# Pass 2
# - get the "dictionary" of PCP words from man-spell for an empty input
#   file
#
# The input file must not be the file the pipeline writes to: the shell
# truncates the output file as the pipeline is started, which may be
# while man-spell is still reading its input.
#
echo >"$tmp.empty"
man-spell -d "$tmp.empty" 2>&1 \
| sed \
    -e '/Common words/d' \
    -e '/ skipped (not troff)/d' \
    -e '/^$/q' \
| tr ' ' '\012' \
| sed -e '/^$/d' \
| sort -u >"$tmp.tmp"

# break dictionary into strings and patterns
# ... any entry containing [ or ? is a pattern, the rest are strings
#
# Start with an empty patterns file, so the counting and sed script
# generation below still work when the dictionary has no patterns
# (awk only creates the file when it prints the first pattern).
#
: >"$tmp.patterns"
awk <"$tmp.tmp" >"$tmp.strings" -v patterns="$tmp.patterns" '
/\[/ || /\?/	{ print > patterns; next }
		{ print }'
[ -n "$save_dict" ] && cp "$tmp.strings" "$save_dict"

# strings containing anything other than letters, _ and ' can never
# match a "word" as extracted in Pass 1
#
grep "[^A-Za-z_']" "$tmp.strings" >"$tmp.tmp"
if [ -s "$tmp.tmp" ]
then
    echo "Warning: dodgey? 'cause ispell won't parse text into these words ..."
    cat "$tmp.tmp"
fi
echo "`wc -l <"$tmp.strings" | sed -e 's/ //g'` strings in the PCP dictionary"
echo "`wc -l <"$tmp.patterns" | sed -e 's/ //g'` patterns in the PCP dictionary"

# turn each pattern into a sed command that prints the lines that match
# the pattern in its entirety, i.e. pattern => /^pattern$/p
#
sed <"$tmp.patterns" >"$tmp.sed" -e 's;.*;/^&$/p;'
#debug# cat $tmp.sed
#debug# cat $tmp.sed

# Report how the man page "words" line up with the PCP dictionary
#
# $tmp.words	- all the "words" in the man page(s)
# $tmp.strings	- strings from PCP dictionary
# $tmp.sed	- sed script with a /^pattern$/p command for each
#		  dictionary pattern
#

# lines common to both files => dictionary strings that are in use
#
comm -12 $tmp.words $tmp.strings >$tmp.tmp
nlines=`wc -l <$tmp.tmp | sed -e 's/ //g'`
echo "$nlines dictionary string matches"
#debug# cat $tmp.tmp

# lines only in $tmp.strings => dictionary strings that match no words
# in any man page ... these ones are listed with -d
#
comm -13 $tmp.words $tmp.strings >$tmp.tmp
nlines=`wc -l <$tmp.tmp | sed -e 's/ //g'`
echo "$nlines dictionary strings not in any man page"
if [ "$debug" -gt 0 ]
then
    cat $tmp.tmp
fi

# "words" that are matched in full by a dictionary pattern
#
sed -E -n -f $tmp.sed <$tmp.words >$tmp.tmp
nlines=`wc -l <$tmp.tmp | sed -e 's/ //g'`
echo "$nlines dictionary pattern matches"
#debug# cat $tmp.tmp

# strings that match no patterns - TODO

exit
