Icons of the Web

#!/usr/bin/env python import cgi import errno import math import md5 import os import re import sys import time import urllib2 import urlparse DATADIR = "find-data" LOCATIONS_DIR = "locations" DOMAINS_DIR = "domains" RESOLUTION = 600 X_OFFSET = 9360 Y_OFFSET = 9360 # The target screen size of icons automatically zoomed on. AUTOZOOM_SIZE = 400 class loc (object): def __init__(self, x, y, size): self.x = x self.y = y self.size = size class timer (object): def __init__(self): self.a = None self.b = None def start(self): self.a = time.clock() def end(self): self.b = time.clock() def elapsed(self): return self.b - self.a def fmt_float(f): return ("%.3f" % f).replace("-", "−"); def is_md5(s): if len(s) != 32: return False for c in s: if c not in "ABCDEF0123456789": return False return True def split_coords(s): try: x, y = s.split(",") x = float(x) y = float(y) except ValueError: return None else: return (x, y) def get_favicon_url(data): for link in re.finditer(r'<\s*link\s+([^>]*)>', data, re.I): attrs = link.groups()[0] rel = None href = None for m in re.finditer(r'(\w+)\s*=\s*["\'](.*?)["\']', attrs, re.I): name, value = m.groups() if name.lower() == "rel": rel = value if name.lower() == "href": href = value if rel and "icon" in rel.lower().split() and href: return href return "/favicon.ico" def print_loc(prefix, loc): if loc is None: print "

%s: Not found.

" % cgi.escape(prefix) else: x_f = float(loc.x + X_OFFSET) / RESOLUTION y_f = float(Y_OFFSET - loc.y) / RESOLUTION print "

%s: The icon is at (%s, %s) and is %d × %d pixels.

" % (cgi.escape(prefix), fmt_float(x_f), fmt_float(y_f), loc.size, loc.size) def lookup(base, hash): result = None filename = os.path.join(DATADIR, base, LOCATIONS_DIR, hash[0:2], hash[2:4]) try: f = open(filename, "r") except IOError, e: if e.errno == errno.ENOENT: return None else: raise else: for line in f: h, l, t, s = line.split() if h == hash: l = int(l) t = int(t) s = int(s) result = loc(l + s / 2, t + s / 2, s * 2) break f.close() return result def lookup_domain_single(domain, filename): f = open(filename, "r") try: for line in f: host, hash = line.strip().split("\t") if host == domain: return hash finally: f.close() def lookup_domain_aux(domain, dir, remaining): if len(remaining) == 0: return lookup_domain_single(domain, os.path.join(dir, "term")) else: k, remaining = remaining[0], remaining[1:] filename = os.path.join(dir, k) if os.path.isdir(filename): return lookup_domain_aux(domain, filename, remaining) else: return lookup_domain_single(domain, filename) def collapse_domain(domain): r = [] for c in domain.lower(): if c.isalnum(): r.append(c) return "".join(r) def lookup_domain(base, domain): remaining = collapse_domain(domain) return lookup_domain_aux(domain, os.path.join(DATADIR, base, DOMAINS_DIR), remaining) def handle_md5(hash): alexa_loc = lookup("alexa", hash) print_loc("MD5 lookup", alexa_loc) return alexa_loc def handle_coords(coords): x, y = coords x = int(x * RESOLUTION + 0.5) - X_OFFSET y = Y_OFFSET - int(y * RESOLUTION + 0.5) return loc(x, y, 16) def get_url(url): t = timer() print "

" print "%s" % cgi.escape(url) sys.stdout.flush() t.start() try: root = urllib2.urlopen(url, None) except Exception, e: print " failed: %s." % cgi.escape(str(e)) print "

" return None, None if root.geturl() != url: print " → %s" % cgi.escape(root.geturl()) sys.stdout.flush() data = root.read(10000) t.end() print " %d bytes in %.2f seconds.
" % (len(data), t.elapsed()) sys.stdout.flush() favicon_url = urlparse.urljoin(root.geturl(), get_favicon_url(data)) print "%s" % cgi.escape(favicon_url) sys.stdout.flush() parts = urlparse.urlparse(favicon_url) if not (parts[0] == "http" or parts[0] == "https"): print " only http and https URLs are supported." print "

" return None, None t.start() try: favicon = urllib2.urlopen(favicon_url, None) except Exception, e: print " failed: %s." % cgi.escape(str(e)) print "

" return None, None if favicon.geturl() != favicon_url: print " → %s" % cgi.escape(favicon.geturl()) sys.stdout.flush() favicon_data = favicon.read(20000) t.end() print " %d bytes in %.2f seconds.
" % (len(favicon_data), t.elapsed()) print "

" sys.stdout.flush() m = md5.new() m.update(favicon_data) hash = m.hexdigest().upper() return hash, favicon.geturl() def handle_url(url): parts = urlparse.urlparse(url) if not parts[0]: url = "http://" + url parts = urlparse.urlparse(url) if not (parts[0] == "http" or parts[0] == "https"): print "

%s only http and https URLs are supported.

" % cgi.escape(url) return url = urlparse.urlunparse(parts) alexa_loc = None hash, url = get_url(url) if hash: alexa_loc = lookup("alexa", hash) print "

" % \ cgi.escape(url, True) print_loc("Online lookup", alexa_loc) if not alexa_loc: domain = parts[1] hash = lookup_domain("alexa", domain) if not hash and domain.startswith("www."): hash = lookup_domain("alexa", domain[4:]) if hash: alexa_loc = lookup("alexa", hash) else: alexa_loc = None print_loc("Survey database lookup", alexa_loc) if not alexa_loc: print """\

Why not found? See the FAQ for more information.

""" return alexa_loc def handle_query(q): coords = split_coords(q) if is_md5(q.upper()): return handle_md5(q.upper()) elif coords: return handle_coords(coords) else: return handle_url(q) def box(n, low, high): if n < low: return low elif n > high: return high else: return n def zoom_for_size(s): """Return a good zoom level for an icon of the given size.""" return box(int(math.log(float(s) / AUTOZOOM_SIZE, 2) + 0.5), 0, 6) form = cgi.FieldStorage() q = form.getfirst("q") print "Content-type: text/html\r" print "Connection: close\r" print "\r" print """\ Icons of the Web

Icons of the Web

A large-scale scan of the top million web sites (per Alexa traffic data) was performed in early 2010 using the Nmap Security Scanner and its scripting engine. As seen in the New York Times, Slashdot, Gizmodo, Engadget, and Telegraph.co.uk ...

We retrieved each site's icon by first parsing the HTML for a link tag and then falling back to /favicon.ico if that failed. 328,427 unique icons were collected, of which 288,945 were proper images. The remaining 39,482 were error strings and other non-image files. Our original goal was just to improve our http-favicon.nse script, but we had enough fun browsing so many icons that we used them to create the visualization below.

The area of each icon is proportional to the sum of the reach of all sites using that icon. When both a bare domain name and its "www." counterpart used the same icon, only one of them was counted. The smallest icons--those corresponding to sites with approximately 0.0001% reach--are scaled to 16x16 pixels. The largest icon (Google) is 11,936 x 11,936 pixels, and the whole diagram is 37,440 x 37,440 (1.4 gigapixels). Since your web browser would choke on that, we have created the interactive viewer below (click and drag to pan, double-click to zoom, or type in a site name to go right to it).

""" sys.stdout.flush() alexa_loc = None if q: alexa_loc = handle_query(q) print """\ """ % (cgi.escape(q and q or "", True)) print """\

+ −

""" print """\ """ print """\

Printed Poster

The graphic has been made into a 24x36 inch poster. Click to see a larger version.

We have only printed 15 posters (for Nmap developers) so far, but we're considering an offset print run if there is enough demand. If you might be interested in buying a physical copy of the poster, please fill out this short form:

Email address (used only to contact you about posters)

Country (for shipping cost estimate)

For downloads of programs and data files, go to this page.

FAQ

Why are some sites not found?: There are a few possible causes. First, the site may not have been among the top million at the time the survey was done. Check the data file to see if it was present. Second, the site may have changed its icon since the survey was done. This page downloads the current icon of the site you type in, and looks up its hash in a database. Failing that, it will look up the site name in the database, but that only works if you use the exact same name we did when doing the survey. Third, it's possible that the site timed out or didn't have an icon at the time of the survey. Fourth, this page limits the size of the icons it will download. If an icon file is too big, it won't be found. Calculate the MD5 sum of the icon yourself and enter it in the search box.
Why are some icons (Amazon, Bing, Baidu) so small?: This usually indicates that the main site timed out during the survey, and only less popular sites using the same icon responded. In other words, it represents a data collection error. For example, baidu.com didn't respond, but baidu.hk and baidu.jp did, and so what would have been one of the biggest icons is instead small. See this page for more technical details and caveats. We didn't fudge the data after the survey or attempt to fill in any obviously "missing" icons.
Why are there two "Я" Yandex icons?: Look closely. The icons are different. The uniqueness of icons is based on their MD5 hash, so even icons that are visually identical may in fact be different. Remember, the original impetus for this scan was to improve the hash database of an Nmap Scripting Engine script.

Credits

Programming and design was done by David Fifield and scanning performed by Brandon Enright.

Nmap Site Navigation

""" sys.stdout.flush()