Usaremos un server CentOS 5.5 actualizado, con SpamAssassin instalado.
-Comenzamos agregando el repositorio de Dag (RPMforge).
Para un server i386:
rpm -Uhv http://apt.sw.be/redhat/el5/en/i386/rpmforge/RPMS/rpmforge-release-0.3.6-1.el5.rf.i386.rpm
Para un server x86-64:
rpm -Uhv http://apt.sw.be/redhat/el5/en/x86_64/rpmforge/RPMS/rpmforge-release-0.3.6-1.el5.rf.x86_64.rpm
-Instalamos dependencias
yum install netpbm gifsicle giflib giflib-utils gocr ocrad ImageMagick tesseract perl-String-Approx perl-MLDBM perl-CPAN perl-MLDBM-Sync
-Descargamos la última versión de FuzzyOcr
wget -c http://users.own-hero.net/~decoder/fuzzyocr/fuzzyocr-3.6.0.tar.gz
-Descomprimimos
tar xzvf fuzzyocr-3.6.0.tar.gz
-Copiamos los archivos del plugin al directorio de configuración de SpamAssassin
cd FuzzyOcr-3.6.0
cp FuzzyOcr.cf FuzzyOcr.scansets FuzzyOcr.preps FuzzyOcr.words FuzzyOcr.pm /etc/mail/spamassassin/
mkdir /etc/mail/spamassassin/FuzzyOcr
cp FuzzyOcr/* /etc/mail/spamassassin/FuzzyOcr
-Configuramos: editamos el archivo FuzzyOcr.cf y dejamos definidas las siguientes opciones
vi /etc/mail/spamassassin/FuzzyOcr.cf
focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words
focr_bin_helper pnmnorm, pnminvert, pamthreshold, ppmtopgm, pamtopnm
focr_bin_helper tesseract
focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin
focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps
focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets
focr_enable_image_hashing 2
focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb
focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db
focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db
focr_db_max_days 15
-Probamos que esté funcionando correctamente
cd samples
spamassassin --debug FuzzyOcr < ocr-gif.eml > /dev/null
Jul 9 17:16:44.612 [10330] dbg: FuzzyOcr: focr_bin_helper: 'pnmnorm,pnminvert,pamthreshold,ppmtopgm,pamtopnm'
Jul 9 17:16:44.613 [10330] info: FuzzyOcr: Adding <5> new helper apps
Jul 9 17:16:44.613 [10330] dbg: FuzzyOcr: focr_bin_helper: 'tesseract'
Jul 9 17:16:44.613 [10330] info: FuzzyOcr: Adding <1> new helper apps
Jul 9 17:16:44.617 [10330] info: FuzzyOcr: Starting preprocessor parser for file "/etc/mail/spamassassin/FuzzyOcr.preps"...
Jul 9 17:16:44.617 [10330] dbg: FuzzyOcr: line: preprocessor normalize {
Jul 9 17:16:44.617 [10330] dbg: FuzzyOcr: line: command = pnmnorm
Jul 9 17:16:44.617 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: preprocessor invert {
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: command = pnminvert
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: preprocessor ppmtopgm {
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: command = ppmtopgm
Jul 9 17:16:44.618 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: preprocessor pamtopnm {
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: command = pamtopnm
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: preprocessor pamthreshold {
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: command = pamthreshold
Jul 9 17:16:44.619 [10330] dbg: FuzzyOcr: line: args = -simple -threshold 0.5
Jul 9 17:16:44.620 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.620 [10330] dbg: FuzzyOcr: line: preprocessor maketiff {
Jul 9 17:16:44.620 [10330] dbg: FuzzyOcr: line: command = pnmtotiff
Jul 9 17:16:44.620 [10330] dbg: FuzzyOcr: line: args = -color -truecolor
Jul 9 17:16:44.621 [10330] dbg: FuzzyOcr: line: }
Jul 9 17:16:44.621 [10330] info: FuzzyOcr: Starting scanset parser for file "/etc/mail/spamassassin/FuzzyOcr.scansets"...
Jul 9 17:16:44.621 [10330] dbg: FuzzyOcr: line scanset ocrad {
Jul 9 17:16:44.621 [10330] dbg: FuzzyOcr: line command = $ocrad
Jul 9 17:16:44.621 [10330] dbg: FuzzyOcr: line args = -s5 $input
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line scanset ocrad-invert {
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line command = $ocrad
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line args = -s5 -i $input
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:44.622 [10330] dbg: FuzzyOcr: line scanset ocrad-decolorize-invert {
Jul 9 17:16:44.623 [10330] dbg: FuzzyOcr: line preprocessors = ppmtopgm, pamthreshold, pamtopnm
Jul 9 17:16:44.623 [10330] dbg: FuzzyOcr: line command = $ocrad
Jul 9 17:16:44.623 [10330] dbg: FuzzyOcr: line args = -s5 -i $input
Jul 9 17:16:44.623 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:44.623 [10330] dbg: FuzzyOcr: line scanset ocrad-decolorize {
Jul 9 17:16:44.624 [10330] dbg: FuzzyOcr: line preprocessors = ppmtopgm, pamthreshold, pamtopnm
Jul 9 17:16:44.624 [10330] dbg: FuzzyOcr: line command = $ocrad
Jul 9 17:16:44.624 [10330] dbg: FuzzyOcr: line args = -s5 $input
Jul 9 17:16:44.624 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:44.624 [10330] dbg: FuzzyOcr: line scanset gocr {
Jul 9 17:16:44.625 [10330] dbg: FuzzyOcr: line command = $gocr
Jul 9 17:16:44.625 [10330] dbg: FuzzyOcr: line args = -i $input
Jul 9 17:16:44.625 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:44.625 [10330] dbg: FuzzyOcr: line scanset gocr-180 {
Jul 9 17:16:44.625 [10330] dbg: FuzzyOcr: line command = $gocr
Jul 9 17:16:44.626 [10330] dbg: FuzzyOcr: line args = -l 180 -d 2 -i $input
Jul 9 17:16:44.626 [10330] dbg: FuzzyOcr: line }
Jul 9 17:16:46.698 [10330] info: FuzzyOcr: Searching in: /usr/local/netpbm/bin
Jul 9 17:16:46.698 [10330] info: FuzzyOcr: Searching in: /usr/local/bin
Jul 9 17:16:46.699 [10330] info: FuzzyOcr: Searching in: /usr/bin
Jul 9 17:16:46.699 [10330] info: FuzzyOcr: Using gifsicle => /usr/bin/gifsicle
Jul 9 17:16:46.700 [10330] info: FuzzyOcr: Using giffix => /usr/bin/giffix
Jul 9 17:16:46.700 [10330] info: FuzzyOcr: Using giftext => /usr/bin/giftext
Jul 9 17:16:46.700 [10330] info: FuzzyOcr: Using gifinter => /usr/bin/gifinter
Jul 9 17:16:46.701 [10330] info: FuzzyOcr: Using giftopnm => /usr/bin/giftopnm
Jul 9 17:16:46.701 [10330] info: FuzzyOcr: Using jpegtopnm => /usr/bin/jpegtopnm
Jul 9 17:16:46.701 [10330] info: FuzzyOcr: Using pngtopnm => /usr/bin/pngtopnm
Jul 9 17:16:46.702 [10330] info: FuzzyOcr: Using bmptopnm => /usr/bin/bmptopnm
Jul 9 17:16:46.702 [10330] info: FuzzyOcr: Using tifftopnm => /usr/bin/tifftopnm
Jul 9 17:16:46.703 [10330] info: FuzzyOcr: Using ppmhist => /usr/bin/ppmhist
Jul 9 17:16:46.703 [10330] info: FuzzyOcr: Using pamfile => /usr/bin/pamfile
Jul 9 17:16:46.703 [10330] info: FuzzyOcr: Using ocrad => /usr/bin/ocrad
Jul 9 17:16:46.704 [10330] info: FuzzyOcr: Using gocr => /usr/bin/gocr
Jul 9 17:16:46.704 [10330] info: FuzzyOcr: Using pnmnorm => /usr/bin/pnmnorm
Jul 9 17:16:46.705 [10330] info: FuzzyOcr: Using pnminvert => /usr/bin/pnminvert
Jul 9 17:16:46.705 [10330] info: FuzzyOcr: Using pamthreshold => /usr/bin/pamthreshold
Jul 9 17:16:46.705 [10330] info: FuzzyOcr: Using ppmtopgm => /usr/bin/ppmtopgm
Jul 9 17:16:46.706 [10330] info: FuzzyOcr: Using pamtopnm => /usr/bin/pamtopnm
Jul 9 17:16:46.706 [10330] info: FuzzyOcr: Using tesseract => /usr/bin/tesseract
Jul 9 17:16:46.706 [10330] dbg: FuzzyOcr: Threshold[max_hash] => 5
Jul 9 17:16:46.707 [10330] dbg: FuzzyOcr: Threshold[c] => 5
Jul 9 17:16:46.707 [10330] dbg: FuzzyOcr: Threshold[s] => 0.01
Jul 9 17:16:46.707 [10330] dbg: FuzzyOcr: Threshold[w] => 0.01
Jul 9 17:16:46.708 [10330] dbg: FuzzyOcr: Threshold[cn] => 0.01
Jul 9 17:16:46.708 [10330] dbg: FuzzyOcr: Threshold[h] => 0.01
Jul 9 17:16:46.709 [10330] dbg: FuzzyOcr: focr_add_score => 1
Jul 9 17:16:46.709 [10330] dbg: FuzzyOcr: focr_autodisable_negative_score => -5
Jul 9 17:16:46.709 [10330] dbg: FuzzyOcr: focr_autodisable_score => 1000
Jul 9 17:16:46.710 [10330] dbg: FuzzyOcr: focr_autosort_buffer => 10
Jul 9 17:16:46.710 [10330] dbg: FuzzyOcr: focr_autosort_scanset => 1
Jul 9 17:16:46.710 [10330] dbg: FuzzyOcr: focr_base_score => 5
Jul 9 17:16:46.711 [10330] dbg: FuzzyOcr: focr_corrupt_score => 2.5
Jul 9 17:16:46.711 [10330] dbg: FuzzyOcr: focr_corrupt_unfixable_score => 5
Jul 9 17:16:46.711 [10330] dbg: FuzzyOcr: focr_counts_required => 2
Jul 9 17:16:46.712 [10330] dbg: FuzzyOcr: focr_db_hash => /etc/mail/spamassassin/FuzzyOcr.db
Jul 9 17:16:46.712 [10330] dbg: FuzzyOcr: focr_db_max_days => 15
Jul 9 17:16:46.712 [10330] dbg: FuzzyOcr: focr_db_safe => /etc/mail/spamassassin/FuzzyOcr.safe.db
Jul 9 17:16:46.713 [10330] dbg: FuzzyOcr: focr_digest_db => /etc/mail/spamassassin/FuzzyOcr.hashdb
Jul 9 17:16:46.713 [10330] dbg: FuzzyOcr: focr_enable_image_hashing => 2
Jul 9 17:16:46.713 [10330] dbg: FuzzyOcr: focr_global_timeout => 0
Jul 9 17:16:46.714 [10330] dbg: FuzzyOcr: focr_global_wordlist => /etc/mail/spamassassin/FuzzyOcr.words
Jul 9 17:16:46.714 [10330] dbg: FuzzyOcr: focr_hashing_learn_scanned => 1
Jul 9 17:16:46.714 [10330] dbg: FuzzyOcr: focr_keep_bad_images => 0
Jul 9 17:16:46.715 [10330] dbg: FuzzyOcr: focr_log_pmsinfo => 1
Jul 9 17:16:46.715 [10330] dbg: FuzzyOcr: focr_log_stderr => 1
Jul 9 17:16:46.715 [10330] dbg: FuzzyOcr: focr_max_height => 800
Jul 9 17:16:46.716 [10330] dbg: FuzzyOcr: focr_max_width => 800
Jul 9 17:16:46.716 [10330] dbg: FuzzyOcr: focr_min_height => 4
Jul 9 17:16:46.716 [10330] dbg: FuzzyOcr: focr_min_width => 4
Jul 9 17:16:46.717 [10330] dbg: FuzzyOcr: focr_minimal_scanset => 1
Jul 9 17:16:46.717 [10330] dbg: FuzzyOcr: focr_mysql_db => FuzzyOcr
Jul 9 17:16:46.717 [10330] dbg: FuzzyOcr: focr_mysql_hash => Hash
Jul 9 17:16:46.718 [10330] dbg: FuzzyOcr: focr_mysql_host => localhost
Jul 9 17:16:46.718 [10330] dbg: FuzzyOcr: focr_mysql_port => 3306
Jul 9 17:16:46.718 [10330] dbg: FuzzyOcr: focr_mysql_safe => Safe
Jul 9 17:16:46.718 [10330] dbg: FuzzyOcr: focr_mysql_update_hash => 0
Jul 9 17:16:46.719 [10330] dbg: FuzzyOcr: focr_mysql_user => fuzzyocr
Jul 9 17:16:46.719 [10330] dbg: FuzzyOcr: focr_no_homedirs => 0
Jul 9 17:16:46.719 [10330] dbg: FuzzyOcr: focr_path_bin => /usr/local/netpbm/bin:/usr/local/bin:/usr/bin
Jul 9 17:16:46.720 [10330] dbg: FuzzyOcr: focr_pdf_maxpages => 1
Jul 9 17:16:46.720 [10330] dbg: FuzzyOcr: focr_personal_wordlist => __userstate__/FuzzyOcr.words
Jul 9 17:16:46.720 [10330] dbg: FuzzyOcr: focr_preprocessor_file => /etc/mail/spamassassin/FuzzyOcr.preps
Jul 9 17:16:46.721 [10330] dbg: FuzzyOcr: focr_scan_pdfs => 0
Jul 9 17:16:46.721 [10330] dbg: FuzzyOcr: focr_scanset_file => /etc/mail/spamassassin/FuzzyOcr.scansets
Jul 9 17:16:46.721 [10330] dbg: FuzzyOcr: focr_score_ham => 0
Jul 9 17:16:46.722 [10330] dbg: FuzzyOcr: focr_skip_bmp => 0
Jul 9 17:16:46.722 [10330] dbg: FuzzyOcr: focr_skip_gif => 0
Jul 9 17:16:46.722 [10330] dbg: FuzzyOcr: focr_skip_jpeg => 0
Jul 9 17:16:46.723 [10330] dbg: FuzzyOcr: focr_skip_png => 0
Jul 9 17:16:46.723 [10330] dbg: FuzzyOcr: focr_skip_tiff => 0
Jul 9 17:16:46.723 [10330] dbg: FuzzyOcr: focr_skip_updates => 0
Jul 9 17:16:46.723 [10330] dbg: FuzzyOcr: focr_strip_numbers => 1
Jul 9 17:16:46.724 [10330] dbg: FuzzyOcr: focr_threshold => 0.25
Jul 9 17:16:46.724 [10330] dbg: FuzzyOcr: focr_timeout => 10
Jul 9 17:16:46.724 [10330] dbg: FuzzyOcr: focr_twopass_scoring_factor => 1.5
Jul 9 17:16:46.725 [10330] dbg: FuzzyOcr: focr_unique_matches => 0
Jul 9 17:16:46.725 [10330] dbg: FuzzyOcr: focr_verbose => 1
Jul 9 17:16:46.725 [10330] dbg: FuzzyOcr: focr_wrongctype_score => 1.5
Jul 9 17:16:46.725 [10330] dbg: FuzzyOcr: focr_wrongext_score => 1.5
Jul 9 17:16:46.726 [10330] info: FuzzyOcr: Loaded preprocessor normalize: /usr/bin/pnmnorm
Jul 9 17:16:46.726 [10330] info: FuzzyOcr: Loaded preprocessor invert: /usr/bin/pnminvert
Jul 9 17:16:46.727 [10330] info: FuzzyOcr: Loaded preprocessor ppmtopgm: /usr/bin/ppmtopgm
Jul 9 17:16:46.727 [10330] info: FuzzyOcr: Loaded preprocessor pamtopnm: /usr/bin/pamtopnm
Jul 9 17:16:46.727 [10330] info: FuzzyOcr: Loaded preprocessor pamthreshold: /usr/bin/pamthreshold -simple -threshold 0.5
Jul 9 17:16:46.728 [10330] info: FuzzyOcr: Loaded preprocessor maketiff: pnmtotiff -color -truecolor
Jul 9 17:16:46.728 [10330] info: FuzzyOcr: Using scan ocrad: /usr/bin/ocrad -s5 $input
Jul 9 17:16:46.729 [10330] info: FuzzyOcr: Using scan ocrad-invert: /usr/bin/ocrad -s5 -i $input
Jul 9 17:16:46.729 [10330] info: FuzzyOcr: Using scan ocrad-decolorize-invert: /usr/bin/ocrad -s5 -i $input
Jul 9 17:16:46.729 [10330] info: FuzzyOcr: Using scan ocrad-decolorize: /usr/bin/ocrad -s5 $input
Jul 9 17:16:46.730 [10330] info: FuzzyOcr: Using scan gocr: /usr/bin/gocr -i $input
Jul 9 17:16:46.730 [10330] info: FuzzyOcr: Using scan gocr-180: /usr/bin/gocr -l 180 -d 2 -i $input
Jul 9 17:16:46.731 [10330] info: FuzzyOcr: Added <43> words from "/etc/mail/spamassassin/FuzzyOcr.words"
Jul 9 17:16:54.025 [10330] info: pyzor: [10345] error: TERMINATED, signal 15 (000f)
Jul 9 17:16:54.374 [10330] dbg: FuzzyOcr: Starting FuzzyOcr...
Jul 9 17:16:54.375 [10330] info: FuzzyOcr: Processing Message with ID "
Jul 9 17:16:54.376 [10330] dbg: FuzzyOcr: fname: "sbillet" => "sbillet"
Jul 9 17:16:54.377 [10330] info: FuzzyOcr: GIF: [327x549] sbillet (7239)
Jul 9 17:16:54.379 [10330] dbg: FuzzyOcr: Saved: /tmp/.spamassassin10330phhpgvtmp/sbillet
Jul 9 17:16:54.380 [10330] dbg: FuzzyOcr: Saved: /tmp/.spamassassin10330phhpgvtmp/raw.eml
Jul 9 17:16:54.380 [10330] info: FuzzyOcr: Found: 1 images
Jul 9 17:16:54.381 [10330] dbg: FuzzyOcr: pfile => /tmp/.spamassassin10330phhpgvtmp/sbillet.pnm
Jul 9 17:16:54.381 [10330] dbg: FuzzyOcr: efile => /tmp/.spamassassin10330phhpgvtmp/sbillet.err
Jul 9 17:16:54.382 [10330] dbg: FuzzyOcr: Errors to: /tmp/.spamassassin10330phhpgvtmp/raw.err
Jul 9 17:16:54.382 [10330] dbg: FuzzyOcr: File has Content-Type "image/jpeg" and no File Extension
Jul 9 17:16:54.382 [10330] info: FuzzyOcr: Found GIF header name="sbillet"
Jul 9 17:16:54.383 [10330] info: FuzzyOcr: Image has format "GIF" but content-type is "image/jpeg"
Jul 9 17:16:54.410 [10346] dbg: FuzzyOcr: Exec : /usr/bin/giftext /tmp/.spamassassin10330phhpgvtmp/sbillet
Jul 9 17:16:54.413 [10346] dbg: FuzzyOcr: Stdout: >/tmp/.spamassassin10330phhpgvtmp/giftext.info
Jul 9 17:16:54.413 [10346] dbg: FuzzyOcr: Stderr: >>/tmp/.spamassassin10330phhpgvtmp/giftext.err
Jul 9 17:16:54.413 [10330] dbg: FuzzyOcr: Saved pid: 10346
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
Jul 9 17:16:54.425 [10330] dbg: FuzzyOcr: Elapsed [10346]: 0.038535 sec. (/usr/bin/giftext: exit 8)
Jul 9 17:16:54.426 [10330] warn: readline() on closed filehandle INFILE at ../FuzzyOcr/Misc.pm line 205.
Jul 9 17:16:54.427 [10330] info: FuzzyOcr: Image is single non-interlaced...
Jul 9 17:16:54.447 [10347] dbg: FuzzyOcr: Exec : /usr/bin/giffix /tmp/.spamassassin10330phhpgvtmp/sbillet
Jul 9 17:16:54.450 [10347] dbg: FuzzyOcr: Stdout: >/tmp/.spamassassin10330phhpgvtmp/sbillet-fixed.gif
Jul 9 17:16:54.450 [10347] dbg: FuzzyOcr: Stderr: >>/tmp/.spamassassin10330phhpgvtmp/sbillet.err
Jul 9 17:16:54.452 [10330] dbg: FuzzyOcr: Saved pid: 10347
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
Jul 9 17:16:54.461 [10330] dbg: FuzzyOcr: Elapsed [10347]: 0.031494 sec. (/usr/bin/giffix: exit 8)
Jul 9 17:16:54.483 [10348] dbg: FuzzyOcr: Exec : /usr/bin/giftopnm /tmp/.spamassassin10330phhpgvtmp/sbillet-fixed.gif
Jul 9 17:16:54.484 [10330] dbg: FuzzyOcr: Saved pid: 10348
Jul 9 17:16:54.485 [10348] dbg: FuzzyOcr: Stdout: >/tmp/.spamassassin10330phhpgvtmp/sbillet.pnm
Jul 9 17:16:54.485 [10348] dbg: FuzzyOcr: Stderr: >>/tmp/.spamassassin10330phhpgvtmp/sbillet.err
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
save_execute: Insecure dependency in open while running with -T switch at ../FuzzyOcr/Misc.pm line 92.
Jul 9 17:16:54.496 [10330] dbg: FuzzyOcr: Elapsed [10348]: 0.031541 sec. (/usr/bin/giftopnm: exit 8)
Jul 9 17:16:54.498 [10330] error: FuzzyOcr: /usr/bin/giftopnm: Returned [2048], skipping...
Jul 9 17:16:54.500 [10330] dbg: FuzzyOcr: Remove DIR: /tmp/.spamassassin10330phhpgvtmp
Jul 9 17:16:54.500 [10330] dbg: FuzzyOcr: FuzzyOcr ending successfully...
Jul 9 17:16:54.501 [10330] dbg: FuzzyOcr: Processed in 0.126789 sec.