@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+import os
+import os.path
+import sys
+import time
+import urllib
+
+# the data snapshot we need is stored in monthly directories, for example:
+# http://archive.routeviews.org/oix-route-views/2008.11/oix-full-snapshot-latest.dat.bz2
+
+filename = 'oix-full-snapshot-latest.dat.bz2'
+url = 'http://archive.routeviews.org/oix-route-views/%s/%s' \
+ % (time.strftime("%Y.%m", time.gmtime()), filename)
+
+if not os.path.exists(filename):
+ print >>sys.stderr, 'downloading', url
+ urllib.urlretrieve(url, filename=filename)
+
+if time.time() - os.path.getmtime(filename) > 60 * 60 * 24 * 7:
+ sys.exit('File older than 1 week - remove it to have it downloaded again')
+
+
+def gen_open(filenames):
+    """Open a sequence of filenames, picking a plain or
+    decompressing opener based on the file extension"""
+ import gzip, bz2
+ for name in filenames:
+ if name.endswith(".gz"):
+ yield gzip.open(name)
+ elif name.endswith(".bz2"):
+ yield bz2.BZ2File(name)
+ else:
+ yield open(name)
+
+def gen_cat(sources):
+    """Concatenate items from one or more
+    sources into a single sequence of items,
+    stripping trailing whitespace from each item"""
+ for s in sources:
+ for item in s:
+ yield item.rstrip()
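+
+# A quick sketch of how the two helpers above chain together (the filenames
+# here are hypothetical, not ones this script uses):
+#
+#   lines = gen_cat(gen_open(['rib-a.dat.bz2', 'rib-b.dat']))
+#
+# yields the right-stripped lines of both files, one after the other, without
+# ever holding a whole file in memory.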
+
+def gen_lines(lines):
+    """Some entries in the dump are broken across two lines, like this:
+
+ * 63.105.200.0/21 203.181.248.168 0 0 0 7660 2516 703 9848 9957 i
+ * 63.105.202.0/27 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 9957 9286 i
+ * 63.105.204.128/25
+ 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i
+ * 63.105.205.0/25 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i
+ * 63.105.207.144/28
+ 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 9957 9286 i
+ * 63.105.248.0/21 196.7.106.245 0 0 0 2905 701 19830 i
+
+    This generator joins such pairs and yields each entry as a single line.
+ """
+ lastline = ''
+ for line in lines:
+        if len(line) > 35:
+            # a long line is either a complete entry or the continuation
+            # half of a broken one; glue it onto the stashed first half
+            if lastline:
+                yield lastline + line
+            else:
+                yield line
+            lastline = ''
+        else:
+            # a short line holds only the prefix; stash it for the next round
+            lastline = line
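+
+# Example, using the sample entries from the docstring above: the short line
+# '* 63.105.204.128/25' is stashed, then glued onto the following indented
+# '203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i' continuation line, so the
+# stages further down always see complete one-line entries.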
+
+
+def gen_grep(patc, lines):
+    """Generate the lines that match a compiled
+    regular expression.  Lines whose AS path contains
+    an AS set (marked by braces) are skipped."""
+ for line in lines:
+ if '{' in line:
+ continue
+ if patc.search(line): yield line
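+
+# In this script the pattern compiled in main() is '^\*', so only actual route
+# entries pass this stage; the preamble and header lines of the dump do not
+# start with '*' and are filtered out, as are entries with AS sets.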
+
+def gen_asn(lines):
+    """For every line (which must end in 'i') yield the prefix,
+    the AS just before the origin in the AS path, and the origin AS.
+
+ Complain if a line doesn't end in 'i'.
+
+    For the prefix 0.0.0.0/0 we don't return the AS number found in the data
+    (e.g. 286), but zero, which is more meaningful later on: an AS with
+    number 0 doesn't exist, and 0.0.0.0/0 is the prefix that will match
+    addresses like 127.0.0.1.
+
+    In the routeviews data, 0.0.0.0/0 is listed with a seemingly random
+    (changing) AS number, which looks like an artifact.
+ """
+ for line in lines:
+ s = line.split()
+ if s[-1] != 'i':
+ print >>sys.stderr, repr(line)
+            sys.exit('Error: this is unusual, line ends in %r, not \'i\'' % s[-1])
+        if s[1].startswith('0.0.0.0/0'):
+            # see comment above: report AS 0 for the default route
+            yield s[1], '0', '0'
+            continue
+ # drop the 'i' at the end
+ s.pop()
+        # drop duplicate AS numbers (prepending) at the end of the path
+ while s[-1] == s[-2]:
+ s.pop()
+ yield s[1], s[-2], s[-1]
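+
+# A worked example, using one of the sample lines from gen_lines' docstring:
+#
+#   '* 63.105.248.0/21 196.7.106.245 0 0 0 2905 701 19830 i'
+#
+# yields ('63.105.248.0/21', '701', '19830').  With AS prepending at the end
+# of the path (say a hypothetical '... 703 9848 9957 9957 i'), the repeated
+# origin AS collapses first, so the result would be (prefix, '9848', '9957').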
+
+# not used here, but useful another time maybe...
+def gen_uniq(lines):
+    """Generate a sequence of lines, dropping lines
+    that are identical to the previous line"""
+ lastline = ''
+ for line in lines:
+ if line != lastline:
+ yield line
+ lastline = line
+
+
+def gen_firstuniq(tupls):
+    """Generate a sequence of tuples, dropping tuples
+    whose first element is the same as in the previous tuple"""
+    last = ''
+    for tupl in tupls:
+        if tupl[0] != last:
+            yield tupl
+        last = tupl[0]
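+
+# In the full table the same prefix typically appears on several consecutive
+# lines, once per routeviews peer; this keeps only the first of them, e.g. a
+# second ('63.105.248.0/21', ...) tuple right after the first one is dropped.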
+
+
+def main():
+ """
+    Create a generator pipeline and process 900 MB worth of routeviews data.
+
+    You can process the bz2- or gz-compressed file directly. If you unpack it
+    beforehand, processing is a few times faster, but the uncompressed data is
+    nearly a GB in size (as of 2008).
+
+ The output format is, for each line:
+
+ prefix asnpeer asn
+
+ Usage: get_routeviews [oix.dat[.gz|.bz2]]
+
+ Will read an existing file named 'oix-full-snapshot-latest.dat.bz2' if no
+ argument is given.
+
+    If the file is older than 1 week, the script asks you to remove it, so
+    that a fresh copy is downloaded automatically on the next run.
+ """
+ import re
+
+    # every route entry in the dump starts with '*'
+    pat = r'^\*'
+    patc = re.compile(pat)
+
+    # gen_open() expects a sequence of names, so wrap the (single) filename
+    global filename
+    filename = [filename]
+    if sys.argv[1:]:
+        filename = [sys.argv[1]]
+
+ try:
+
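+        # each stage below is a generator wrapping the previous one, so the
+        # snapshot is streamed lazily: nothing is read, decompressed or parsed
+        # until the for loop at the bottom pulls lines through the whole chain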
+ oixfile = gen_open(filename)
+ oixlines = gen_cat(oixfile)
+ fixedlines = gen_lines(oixlines)
+ patlines = gen_grep(patc, fixedlines)
+ pfxasn = gen_asn(patlines)
+ pfxasn_uniq = gen_firstuniq(pfxasn)
+
+ for pfx, asnb, asn in pfxasn_uniq:
+ print pfx, asnb, asn
+
+ except KeyboardInterrupt:
+ sys.exit('interrupted!')
+ except IOError, e:
+ sys.exit(e)
+
+if __name__ == '__main__':
+ main()