@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+import os
+import os.path
+import sys
+import time
+import urllib
+
+# the data snapshot we need is stored in monthly directories, for example:
+# http://archive.routeviews.org/oix-route-views/2008.11/oix-full-snapshot-latest.dat.bz2
+
+filename = 'oix-full-snapshot-latest.dat.bz2'
+url = 'http://archive.routeviews.org/oix-route-views/%s/%s' \
+ % (time.strftime("%Y.%m", time.gmtime()), filename)
+
+if not os.path.exists(filename):
+ print >>sys.stderr, 'downloading', url
+ urllib.urlretrieve(url, filename=filename)
+
+if time.time() - os.path.getmtime(filename) > 60 * 60 * 24 * 7:
+ sys.exit('File older than 1 week - remove it to have it downloaded again')
+
+
+def gen_open(filenames):
+    """Open a sequence of filenames, picking a plain or
+    decompressing opener based on the file extension"""
+ import gzip, bz2
+ for name in filenames:
+ if name.endswith(".gz"):
+ yield gzip.open(name)
+ elif name.endswith(".bz2"):
+ yield bz2.BZ2File(name)
+ else:
+ yield open(name)
+
+def gen_cat(sources):
+    """Concatenate items from one or more
+    sources into a single sequence of items,
+    stripping trailing whitespace from each item"""
+ for s in sources:
+ for item in s:
+ yield item.rstrip()
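+
+# A quick sketch of how the two helpers above chain together (the filenames
+# here are hypothetical, not ones this script uses):
+#
+#   lines = gen_cat(gen_open(['rib-a.dat.bz2', 'rib-b.dat']))
+#
+# yields the right-stripped lines of both files, one after the other, without
+# ever holding a whole file in memory.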
+
+def gen_lines(lines):
+    """Some entries in the dump are broken across two lines, like this:
+
+ * 63.105.200.0/21 203.181.248.168 0 0 0 7660 2516 703 9848 9957 i
+ * 63.105.202.0/27 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 9957 9286 i
+ * 63.105.204.128/25
+ 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i
+ * 63.105.205.0/25 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i
+ * 63.105.207.144/28
+ 203.62.252.186 0 0 0 1221 4637 4766 9318 9957 9957 9286 i
+ * 63.105.248.0/21 196.7.106.245 0 0 0 2905 701 19830 i
+
+    This generator joins such pairs and yields each entry as a single line.
+ """
+ lastline = ''
+ for line in lines:
+        if len(line) > 35:
+            # a long line is either a complete entry or the continuation
+            # half of a broken one; glue it onto the stashed first half
+            if lastline:
+                yield lastline + line
+            else:
+                yield line
+            lastline = ''
+        else:
+            # a short line holds only the prefix; stash it for the next round
+            lastline = line
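+
+# Example, using the sample entries from the docstring above: the short line
+# '* 63.105.204.128/25' is stashed, then glued onto the following indented
+# '203.62.252.186 0 0 0 1221 4637 4766 9318 9957 i' continuation line, so the
+# stages further down always see complete one-line entries.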
+
+
+def gen_grep(patc, lines):
+    """Generate the lines that match a compiled
+    regular expression.  Lines whose AS path contains
+    an AS set (marked by braces) are skipped."""
+ for line in lines:
+ if '{' in line:
+ continue
+ if patc.search(line): yield line
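+
+# In this script the pattern compiled in main() is '^\*', so only actual route
+# entries pass this stage; the preamble and header lines of the dump do not
+# start with '*' and are filtered out, as are entries with AS sets.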
+
+def gen_asn(lines):
+    """For every line (which must end in 'i') yield the prefix,
+    the AS just before the origin in the AS path, and the origin AS.
+
+ Complain if a line doesn't end in 'i'.
+
+    For the prefix 0.0.0.0/0 we don't return the AS number found in the data
+    (e.g. 286), but zero, which is more meaningful later on: an AS with
+    number 0 doesn't exist, and 0.0.0.0/0 is the prefix that will match
+    addresses like 127.0.0.1.
+
+    In the routeviews data, 0.0.0.0/0 is listed with a seemingly random
+    (changing) AS number, which looks like an artifact.
+ """
+ for line in lines:
+ s = line.split()
+ if s[-1] != 'i':
+ print >>sys.stderr, repr(line)
+            sys.exit('Error: this is unusual, line ends in %r, not \'i\'' % s[-1])
+        if s[1].startswith('0.0.0.0/0'):
+            # see comment above: report AS 0 for the default route
+            yield s[1], '0', '0'
+            continue
+ # drop the 'i' at the end
+ s.pop()
+        # drop duplicate AS numbers (prepending) at the end of the path
+ while s[-1] == s[-2]:
+ s.pop()
+ yield s[1], s[-2], s[-1]
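+
+# A worked example, using one of the sample lines from gen_lines' docstring:
+#
+#   '* 63.105.248.0/21 196.7.106.245 0 0 0 2905 701 19830 i'
+#
+# yields ('63.105.248.0/21', '701', '19830').  With AS prepending at the end
+# of the path (say a hypothetical '... 703 9848 9957 9957 i'), the repeated
+# origin AS collapses first, so the result would be (prefix, '9848', '9957').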
+
+# not used here, but useful another time maybe...
+def gen_uniq(lines):
+    """Generate a sequence of lines, dropping lines
+    that are identical to the previous line"""
+ lastline = ''
+ for line in lines:
+ if line != lastline:
+ yield line
+ lastline = line
+
+
+def gen_firstuniq(tupls):
+    """Generate a sequence of tuples, dropping tuples
+    whose first element is the same as in the previous tuple"""
+    last = ''
+    for tupl in tupls:
+        if tupl[0] != last:
+            yield tupl
+        last = tupl[0]
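+
+# In the full table the same prefix typically appears on several consecutive
+# lines, once per routeviews peer; this keeps only the first of them, e.g. a
+# second ('63.105.248.0/21', ...) tuple right after the first one is dropped.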
+
+
+def main():
+ """
+    Create a generator pipeline and process 900 MB worth of routeviews data.
+
+    You can process the bz2- or gz-compressed file directly. If you unpack it
+    beforehand, processing is a few times faster, but the uncompressed data is
+    nearly a GB in size (as of 2008).
+
+ The output format is, for each line:
+
+ prefix asnpeer asn
+
+ Usage: get_routeviews [oix.dat[.gz|.bz2]]
+
+ Will read an existing file named 'oix-full-snapshot-latest.dat.bz2' if no
+ argument is given.
+
+    If the file is older than 1 week, the script asks you to remove it, so
+    that a fresh copy is downloaded automatically on the next run.
+ """
+ import re
+
+    # every route entry in the dump starts with '*'
+    pat = r'^\*'
+    patc = re.compile(pat)
+
+    # gen_open() expects a sequence of names, so wrap the (single) filename
+    global filename
+    filename = [filename]
+    if sys.argv[1:]:
+        filename = [sys.argv[1]]
+
+ try:
+
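+        # each stage below is a generator wrapping the previous one, so the
+        # snapshot is streamed lazily: nothing is read, decompressed or parsed
+        # until the for loop at the bottom pulls lines through the whole chain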
+ oixfile = gen_open(filename)
+ oixlines = gen_cat(oixfile)
+ fixedlines = gen_lines(oixlines)
+ patlines = gen_grep(patc, fixedlines)
+ pfxasn = gen_asn(patlines)
+ pfxasn_uniq = gen_firstuniq(pfxasn)
+
+ for pfx, asnb, asn in pfxasn_uniq:
+ print pfx, asnb, asn
+
+ except KeyboardInterrupt:
+ sys.exit('interrupted!')
+ except IOError, e:
+ sys.exit(e)
+
+if __name__ == '__main__':
+ main()