;;; @Package UrlSkip ;;; @Subtitle Web URL Simplification in Scheme ;;; @HomePage http://www.neilvandyke.org/urlskip/ ;;; @Author Neil W. Van Dyke ;;; @AuthorEmail neil@@neilvandyke.org ;;; @Version 0.1 ;;; @Date 3 January 2005 ;; $Id: urlskip.scm,v 1.67 2005/01/03 05:09:12 neil Exp $ ;;; @legal ;;; Copyright @copyright{} 2005 Neil W. Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU General Public License as published by the Free Software Foundation; ;;; either version 2 of the License, or (at your option) any later version. ;;; This program is distributed in the hope that it will be useful, but without ;;; any warranty; without even the implied warranty of merchantability or ;;; fitness for a particular purpose. See the GNU General Public License [GPL] ;;; for details. For other license options and commercial consulting, contact ;;; the author. ;;; @end legal (require (lib "uri.ss" "uri")) ;; (require (lib "testeez.ss" "testeez")) ;;; @section Introduction ;;; ;;; The UrlSkip Scheme library provides a function that translates some of the ;;; Web URLs that might be used to track a user across sites, by removing ;;; intermediate HTTP redirectors or information that might identify the user. ;;; Such a function might be used as part of a privacy-enhancing Web browser, ;;; or to canonicalize or un-obfuscate URLs for Web analysis projects. ;;; ;;; Note that UrlSkip is not intended to remove information used by ;;; ``affiliate'' referral programs to identify site operators that have sent ;;; users to a site. However, in some cases this affiliate ID information ;;; might be lost in the process of removing a intermediary URL that is used by ;;; a third party to track and profile users. ;;; ;;; UrlSkip currently requires R5RS, the [uri.scm] library, and a particular ;;; regular expression function. Therefore, UrlSkip currently works only with ;;; PLT MzScheme, although it will be made more portable once @code{uri.scm} ;;; is. ;;; ;;; UrlSkip is released under the GPL license, unlike most of the author's ;;; other released Scheme libraries, which are LGPL. (define (urlskip-internal:rsegs/slashok uriobj) (uripath-segments/reverse (uriobj-uripath/noparams uriobj))) (define (urlskip-internal:rsegs/noslash uriobj) (let ((rsegs (urlskip-internal:rsegs/slashok uriobj))) (if (or (null? rsegs) (car rsegs)) rsegs (cdr rsegs)))) ;;; @section Host Handlers ;;; ;;; The procedures in this section are used internally by the @code{urlskip} ;;; procedure, and correspond to particular HTTP server hostnames. They are ;;; exposed here mainly for purposes of documentation, and are likely to change ;;; in future versions of UrlSkip. Each procedure accepts a @i{uriobj} and ;;; yields either a new URL string of a simpler URL, or @code{#f} if no simpler ;;; URL was determined. ;;; @defproc urlskip-http-ad-doubleclick-net uriobj ;;; ;;; UrlSkips @code{http://ad.doubleclick.net}:@* ;;; Substring following @code{;;~sscs=%3f}. (define (urlskip-http-ad-doubleclick-net uriobj) (let ((str (uriobj->string uriobj))) (cond ((regexp-match-positions #rx";;~sscs=%3f(.*)" str) => (lambda (m) (let ((pos (cadr m))) ;; Note: This shouldn't need unescaping, but maybe it does. (substring str (car pos) (cdr pos))))) (else #f)))) ;;; @defproc urlskip-http-click-linksynergy-com uriobj ;;; ;;; UrlSkips @code{http://click.linksynergy.com}:@* ;;; If path @code{/fs-bin/stat}, then query value @code{RD_PARM1} or ;;; @code{rd_parm1}. (define (urlskip-http-click-linksynergy-com uriobj) (cond ((equal? (urlskip-internal:rsegs/noslash uriobj) '("stat" "fs-bin")) (let ((q (uriobj-uriquery uriobj))) (cond ((or (uriquery-value q "RD_PARM1") (uriquery-value q "rd_parm1")) => (lambda (s) (urlskip (uri-unescape/shareok s)))) (else #f)))) (else #f))) ;;; @defproc urlskip-http-rds-yahoo-com uriobj ;;; ;;; UrlSkips @code{http://rds.yahoo.com}:@* ;;; Substring of the @code{http} URL following @code{*-}. (define (urlskip-http-rds-yahoo-com uriobj) ;; TODO: Rather than trying to do this in R5RS from uri.scm parse output, we ;; should probably just regexp-match around "/\*-" and uri-unescape to pick ;; out the URL, since uri.scm has a dependency on regexps anyway. (let loop ((rs (urlskip-internal:rsegs/slashok uriobj)) (out '())) (if (null? rs) #f (let ((head (car rs))) (if (equal? head "*-http:") (apply string-append "http:" out) ;; Note: It's OK to cons here on a search pass, since ;; "rds.yahoo.com" should always find something (we're not ;; consing unnecessarily). (loop (cdr rs) (cons "/" (cons (if head (uri-escape/shareok head) "") out)))))))) ;;; @defproc urlskip-http-service-netmeans-com uriobj ;;; ;;; UrlSkips @code{http://service.netmeans.com}:@* ;;; If path @code{/bfast/click}, then query value @code{loc}. (define (urlskip-http-service-netmeans-com uriobj) (cond ((equal? (urlskip-internal:rsegs/noslash uriobj) '("click" "bfast")) (cond ((uri-query-value uriobj "loc") => urlskip) (else #f))) (else #f))) ;;; @defproc urlskip-http-web-ask-com uriobj ;;; ;;; UrlSkips @code{http://web.ask.com}:@* ;;; If path @code{/redir}, then query value @code{bu}. (define (urlskip-http-web-ask-com uriobj) (cond ((equal? (urlskip-internal:rsegs/noslash uriobj) '("redir")) (cond ((uri-query-value uriobj "bu") => urlskip) (else #f))) (else #f))) ;;; @defproc urlskip-http-www-amazon-com uriobj ;;; ;;; UrlSkips @code{http://www.amazon.com}:@* ;;; If path @code{/exec/obidos/redirect}, then remove all query values except ;;; for @code{tag} and @code{path}. (define (urlskip-http-www-amazon-com uriobj) (let ((rs (urlskip-internal:rsegs/noslash uriobj))) (cond ((equal? rs '("redirect" "obidos" "exec")) (let ((q (uriobj-uriquery uriobj))) (cond ((uriquery-value q "path") => (lambda (path) (string-append "http://www.amazon.com/exec/obidos/redirect?" (cond ((uriquery-value q "tag") => (lambda (tag) (string-append "tag=" (uri-escape tag) "&"))) (else "")) "path=" ;; Note: they do not URI-escape "path". path))) (else #f)))) ;; TODO: Match other "http://www.amazon.com" formats here. (else #f)))) ;;; @defproc urlskip-http-www-anrdoezrs-net uriobj ;;; ;;; UrlSkips @code{http://www.anrdoezrs.net}:@* ;;; Query value @code{url}. (define (urlskip-http-www-anrdoezrs-net uriobj) (cond ((uri-query-value uriobj "url") => urlskip) (else #f))) ;;; @defproc urlskip-http-www-commission-junction-com uriobj ;;; ;;; UrlSkips @code{http://www.commission-junction.com}:@* ;;; If path @code{/track/track.dll}, then query value @code{URL}. (define (urlskip-http-www-commission-junction-com uriobj) (cond ((equal? (urlskip-internal:rsegs/noslash uriobj) '("track.dll" "track")) (cond ((uri-query-value uriobj "URL") => urlskip) (else #f))) (else #f))) ;;; @defproc urlskip-http-www-google-com uriobj ;;; ;;; UrlSkips @code{http://www.google.com}:@* ;;; If path @code{/pagead/iclk}, then query value @code{adurl}.@* ;;; If path @code{/url}, then query value @code{q}. (define (urlskip-http-www-google-com uriobj) (cond ((equal? (urlskip-internal:rsegs/noslash uriobj) '("iclk" "pagead")) (cond ((uri-query-value uriobj "adurl") => urlskip) (else #f))) ((equal? (urlskip-internal:rsegs/noslash uriobj) '("url")) (cond ((uri-query-value uriobj "q") => urlskip) (else #f))) (else #f))) ;;; @defproc urlskip-http-www-qksrv-net uriobj ;;; ;;; UrlSkips @code{http://www.qksrv.net}:@* ;;; Query value @code{loc} or @code{url}. (define (urlskip-http-www-qksrv-net uriobj) ;; TODO: Check for path. (let ((q (uriobj-uriquery uriobj))) (cond ((or (uriquery-value q "loc") (uriquery-value q "url")) => urlskip) (else #f)))) ;;; @section Interface ;;; The only real library interface is the @code{urlskip} procedure. (define urlskip-internal:http-host-map ;; TODO: Maybe make this a user-mutable parameter. `(("ad.doubleclick.net" . ,urlskip-http-ad-doubleclick-net) ("click.linksynergy.com" . ,urlskip-http-click-linksynergy-com) ("rds.yahoo.com" . ,urlskip-http-rds-yahoo-com) ("service.netmeans.com" . ,urlskip-http-service-netmeans-com) ("web.ask.com" . ,urlskip-http-web-ask-com) ("www.amazon.com" . ,urlskip-http-www-amazon-com) ("www.anrdoezrs.net" . ,urlskip-http-www-anrdoezrs-net) ("www.commission-junction.com" . ,urlskip-http-www-commission-junction-com) ("www.google.com" . ,urlskip-http-www-google-com) ("www.qksrv.net" . ,urlskip-http-www-qksrv-net))) (define (urlskip-internal:from-uriobj uriobj) (cond ((and (eq? (uriobj-scheme uriobj) 'http) (assoc (uriserver-host (uriobj-uriserver uriobj)) urlskip-internal:http-host-map)) => (lambda (p) ((cdr p) uriobj))) (else #f))) ;;; @defproc urlskip uri ;;; ;;; Accepts a URL @var{uri} and yields a URL that is either @var{uri} or a ;;; UrlSkip simplified version of same. @var{uri} may be a string or a ;;; @i{uriobj}. If a simplified URL is yielded, it is always a string. (define (urlskip uri) (cond ((urlskip-internal:from-uriobj (uri->uriobj uri)) => uri->string) (else uri))) ;;; @section Tests ;;; The UrlSkip test suite can be enabled by editing the source code file and ;;; loading [Testeez]; the test suite is disabled by default. (define-syntax urlskip-internal:testeez (syntax-rules () ((_ x ...) ;; Note: Comment-out exactly one of the following two lines. (error "Tests disabled.") ;; (testeez x ...) ))) (define (urlskip-internal:test) (urlskip-internal:testeez "UrlSkip" (test/equal "Unhandled host" (urlskip "http://unhandled.foo/") "http://unhandled.foo/") (test/equal "Handled host, but not path" (urlskip "http://www.google.com/") "http://www.google.com/") (test/equal "Unhandled URI-scheme" (urlskip "ftp://ftp.gnu.org/") "ftp://ftp.gnu.org/") (test/equal "ad.doubleclick.net/click #1" (urlskip (string-append "http://ad.doubleclick.net/click;h=v3|31f3|0|0|%2a|o;111;0-0;0;222;333-4" "44|555;666|777|1;;~sscs=%3fhttp://www.esri.com/businessmap")) "http://www.esri.com/businessmap") (test/equal "ad.doubleclick.net/click #2" (urlskip (string-append "http://ad.doubleclick.net/click;h=v3|31f3|0|0|%2a|j;111;0-0;1;222;1-333" "|444;555|666|1;;~sscs=%3fhttp://www.sas.com/apps/sim/redirect.jsp?detai" "l=AAA")) "http://www.sas.com/apps/sim/redirect.jsp?detail=AAA") (test/equal "click.linksynergy.com/fs-bin/stat" (urlskip (string-append "http://click.linksynergy.com/fs-bin/stat?id=AAA&offerid=111&type=3&subi" "d=0&tmpid=222&rd_parm1=http%253a%252f%252fphobos.apple.com%252fwebobjec" "ts%252fmzstore.woa%252fwa%252fviewalbum%253fplaylistid%253d333%2526orig" "instorefront%253d444%26partnerid%3d555")) (string-append "http://phobos.apple.com/webobjects/mzstore.woa/wa/viewalbum?playlistid=3" "33&originstorefront=444&partnerid=555")) (test/equal "rds.yahoo.com" (urlskip (string-append "http://rds.yahoo.com/S=111/K=linux/v=2/SID=e/l=WS1/R=1/SS=222/MI=other/" "IPC=us/SHE=0/H=3/SIG=119s09tim/EXP=333/*-http%3A//www.linux.com/")) "http://www.linux.com/") (test/equal "service.netmeans.com/bfast/click" (urlskip (string-append "http://service.netmeans.com/bfast/click/?loc=http%3a//search.ebay.com/A" "AA-BBB_w0qqsokeywordredirectz1qqfromzr8&bfinfo=CCC")) "http://search.ebay.com/AAA-BBB_w0qqsokeywordredirectz1qqfromzr8") (test/equal "web.ask.com/redir" (urlskip (string-append "http://web.ask.com/redir?bpg=http%3a%2f%2fweb.ask.com%2fweb%3fq%3dsuck%" "26o%3d0%26page%3d1&q=suck&u=http%3a%2f%2ftm.wc.ask.com%2fr%3ft%3dan%26s" "%3da%26uid%3d2425c471e425c471e%26sid%3d3425c471e425c471e%26qid%3d000F7E" "296281F74AB34FE6E15C1CAE8F%26io%3d0%26sv%3dz6f5372cd%26o%3d0%26ask%3dsu" "ck%26uip%3d425c471e%26en%3dte%26eo%3d-100%26pt%3dSuck%253a%2bDaily%26ac" "%3d24%26qs%3d0%26pg%3d1%26ep%3d1%26te_par%3d106%26te_id%3d%26u%3dhttp%3" "a%2f%2fwww.suck.com%2f&s=a&bu=http%3a%2f%2fwww.suck.com%2f&qte=0&o=0&ab" "s=Save+us+a+leg.+Turkey+Tidings+From+Suck.com+(Click+thumbnails+below.)" "+%3a+Home+%7c+A.+Fast+'n'+easy+recipes+for+holiday+fun!++%7c+B.+Fowl+pl" "ay%3f!&tit=Suck%3a+Daily&bin=&cat=wp&purl=http%3a%2f%2ftm.wc.ask.com%2f" "i%2fb.html%3ft%3dan%26s%3da%26uid%3d2425c471e425c471e%26sid%3d3425c471e" "425c471e%26qid%3d000F7E296281F74AB34FE6E15C1CAE8F%26io%3d%26sv%3dz6f537" "2cd%26o%3d0%26ask%3dsuck%26uip%3d425c471e%26en%3dbm%26eo%3d-100%26pt%3d" "%26ac%3d24%26qs%3d0%26pg%3d1%26u%3dhttp%3a%2f%2fmyjeeves.ask.com%2facti" "on%2fsnip&Complete=1")) "http://www.suck.com/") (test/equal "www.amazon.com/exec/obidos/redirect" (urlskip (string-append "http://www.amazon.com/exec/obidos/redirect?tag=AAA&creative=111&camp=22" "2&link_code=bn1&path=asin/b333")) "http://www.amazon.com/exec/obidos/redirect?tag=AAA&path=asin/b333") (test/equal "www.anrdoezrs.net" (urlskip (string-append "http://www.anrdoezrs.net/click-111-222?url=http%3a%2f%2fhalf.ebay.com%2" "fproducts%2fbooks%2fdetail.cfm%3fisbn%3d333")) "http://half.ebay.com/products/books/detail.cfm?isbn=333") (test/equal "www.commission-junction.com/track/track.dll" (urlskip (string-append "http://www.commission-junction.com/track/track.dll?SID=&AID=111&PID=222" "&URL=http%3A%2F%2Fwww%2Ebooksamillion%2Ecom%2Fncom%2Fbooks%3Fisbn%3D333" "")) "http://www.booksamillion.com/ncom/books?isbn=333") (test/equal "www.google.com/pagead/iclk then service.netmeans/bfast/click" (urlskip (string-append "http://www.google.com/pagead/iclk?adurl=http://service.netmeans.com/bfa" "st/click/%3floc%3dhttp%253a//search.ebay.com/yamaha-remote_w0qqsokeywor" "dredirectz1qqfromzr8%26bfinfo%3da111&sa=l&ai=REALLYLONGNONSENSESTRING")) "http://search.ebay.com/yamaha-remote_w0qqsokeywordredirectz1qqfromzr8") (test/equal "www.google.com/url" (urlskip (string-append "http://www.google.com/url?sa=l&q=http://www.shopping.com/xGS-AAA_BBB~NS" "-1~linkin_id-111&ai=REALLYLONGNONSENSESTRING&num=3")) "http://www.shopping.com/xGS-AAA_BBB~NS-1~linkin_id-111") (test/equal "www.qksrv.net #1" (urlskip (string-append "http://www.qksrv.net/click-111-222?SID=333&url=http://dogbert.abebooks." "com/abe/BookDetails?bi=444")) "http://dogbert.abebooks.com/abe/BookDetails?bi=444") (test/equal "www.qksrv.net #2" (urlskip (string-append "http://www.qksrv.net/click-111-222?SID=&loc=http%3A//search.ebay.com/se" "arch/search.dll%3Fcgiurl%3Dhttp%253A%252F%252Fcgi.ebay.com%252Fws%252F%" "26krd%3D1%26from%3DR8%26MfcISAPICommand%3DGetResult%26ht%3D1%26SortProp" "erty%3DMetaEndSort%26query%3D333")) (string-append "http://search.ebay.com/search/search.dll?cgiurl=http%3A%2F%2Fcgi.ebay.co" "m%2Fws%2F&krd=1&from=R8&MfcISAPICommand=GetResult&ht=1&SortProperty=Meta" "EndSort&query=333")) )) ;; Unsupported: ;; ;; "http://www.amazon.de/exec/obidos/ASIN/0375432302/isbnnu-21/ref=nosim" ;; "http://www.amazon.de/exec/obidos/ASIN/0375432302/" ;; ;; "http://www.amazon.com/exec/obidos/tg/detail/-/B00000IWCU/davetaylor" ;; "http://www.amazon.com/exec/obidos/tg/detail/-/B00000IWCU/" ;; ;; "http://www.amazon.com/exec/obidos/redirect-home/thedentalpage-20" ;; "http://www.amazon.com/exec/obidos/redirect-home/" ;; ;; TODO: Very long "overture.com" URLs look like they can be decoded rather ;; than being keys... ;;; @unnumberedsec History ;;; @table @asis ;;; ;;; @item Version 0.1 --- 3 January 2005 ;;; Initial release. ;;; ;;; @end table ;;; @unnumberedsec References ;;; ;;; @table @asis ;;; ;;; @item [GPL] ;;; Free Software Foundation, ``GNU Lesser General Public License,'' Version 2, ;;; June 1991, 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.@* ;;; @uref{http://www.gnu.org/copyleft/gpl.html} ;;; ;;; @item [Testeez] ;;; Neil W. Van Dyke, ``Testeez: Simple Test Mechanism for Scheme,'' Version ;;; 0.1.@* ;;; @uref{http://www.neilvandyke.org/testeez/} ;;; ;;; @item [uri.scm] ;;; Neil W. Van Dyke, ``uri.scm: Web Uniform Resource Identifiers (URI) in ;;; Scheme,'' Version 0.1.@* ;;; @uref{http://www.neilvandyke.org/uri-scm/} ;;; ;;; @end table