Added the website hit statistics generator, produces data used by stattrans.pl

CVS version numbers get-www-stats: INITIAL -> 1.1
author: Marcin Owsiany <porridge> 2010-12-27 16:39:28 +0000
committer: Marcin Owsiany <porridge> 2010-12-27 16:39:28 +0000
commit: 9511385a1543fd0f67f2128208e6f082e76e27f0 (patch)
tree: db884b3dc06d888f47a6e511a7615b7a7400f2e6 /get-www-stats
parent: 57314027eb8a0d5e82f0871c12eac2b2a4eafec2 (diff)
1 files changed, 108 insertions, 0 deletions
diff --git a/get-www-stats b/get-www-stats
new file mode 100755
index 00000000000..2fb461eb23f
--- /dev/null
+++ b/get-www-stats
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+
+# get-www-stats - Debian web site popularity statistics
+# Copyright 2010 Marcin Owsiany <porridge@debian.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+# This program is run daily on a Debian website mirror like this:
+#
+#   get-www-stats > stats.txt-pending && mv stats.txt-pending stats.txt
+#
+# The output is then exported via DDE (see http://wiki.debian.org/DDE) and used
+# by the stattrans.pl script to sort the page lists in the Debian web site
+# translation statistics pages.
+
+try:
+  import json
+except ImportError:
+  import simplejson as json
+
+from gzip import open as gzopen
+import logging
+import os
+import re
+import sys
+
+#logging.basicConfig(level=logging.INFO)
+
+logs_dir = '/var/log/apache2'  # directory scanned for access logs
+logs_prefix = 'www.debian.org-access.log'  # only files starting with this are read
+logs_count = 10  # number of highest-stamped log files to aggregate
+
+# Collect a (stamp, filename, gzipped) tuple for every usable access log.
+logs = []
+for f in os.listdir(logs_dir):
+  if not f.startswith(logs_prefix):
+    continue
+  parts = f.split('-')
+  if len(parts) == 2:
+    logs.append((99999999, f, False))  # no stamp field: sentinel sorts it last
+    continue
+  gzipped = f.endswith('.gz')
+  stamp = parts[-1]
+  if gzipped:
+    stamp = stamp[:-3]  # drop the '.gz' suffix
+  if len(parts) == 3 and stamp.isdigit():
+    logs.append((int(stamp), f, gzipped))
+  else:  # extra dashes, or a stamp int() would reject (used to abort the run)
+    logging.warning('Skipping unexpected filename [%s].' % f)
+
+# Count hits per normalized URL across the logs_count highest-stamped logs.
+counts = {}
+for n, f, gzipped in sorted(logs)[-logs_count:]:
+  logfile = os.path.join(logs_dir, f)
+  logging.info('Reading %s.' % logfile)
+  stream = (gzipped and gzopen or open)(logfile)
+  try:
+    for line in stream:
+      tokens = line.split()
+      if len(tokens) < 7:  # blank or truncated line: used to raise IndexError
+        continue
+      url = re.sub(r'\...\.html$', '', tokens[6])  # '/x.en.html' -> '/x'
+      url = re.sub(r'/$', '/index', url)  # '/dir/' -> '/dir/index'
+      counts[url] = counts.get(url, 0) + 1
+  finally:
+    stream.close()  # do not rely on garbage collection to release the file
+  
+if '/index' not in counts:  # sanity checks: an exception here stops before any output
+  raise Exception('No data for /index')
+elif counts['/index'] < 10000:  # must precede the 50k test, which used to shadow it
+  raise Exception('Less than 10k hits for /index')
+elif counts['/index'] < 50000:  # suspicious but not fatal: warn and carry on
+  logging.warning('Less than 50k hits for /index')
+
+# Write (hits, url) pairs to stdout, highest count first; URLs with <= 2 hits are dropped.
+ranked = sorted([(hits, url) for (url, hits) in counts.iteritems() if hits > 2], reverse=True)
+json.dump(ranked, sys.stdout, indent=2)
+
+# for v, k in sorted([(v, k) for (k, v) in counts.iteritems()], reverse=True):
+#   print '%8d %s' % (v, k)
+#   if v < 3:
+#     break
+
+# Perl original:
+# @f=split;
+# $s = $f[6];
+# $s =~ s,\...\.html,,;
+# $s =~ s,/$,/index,;
+# $S{$s} += 1;
+# END{
+#   printf "%d normalized URLs\n", scalar keys %S;
+#   foreach my $k (sort { $S{$b} <=> $S{$a} } keys %S) {
+#     printf "%8d %s\n", $S{$k}, $k
+#   }
+# }
+
author	Marcin Owsiany <porridge>	2010-12-27 16:39:28 +0000
committer	Marcin Owsiany <porridge>	2010-12-27 16:39:28 +0000
commit	9511385a1543fd0f67f2128208e6f082e76e27f0 (patch)
tree	db884b3dc06d888f47a6e511a7615b7a7400f2e6 /get-www-stats
parent	57314027eb8a0d5e82f0871c12eac2b2a4eafec2 (diff)