Ticket #34: analyze-traffic.pl

File analyze-traffic.pl, 6.6 KB (added by eagle, 12 years ago)

analyze-traffic

Line 
1#!/usr/bin/perl -w
2
3########################################################################
4#
5# analyze-traffic.pl
6#
7# Written by Jeffrey M. Vinocur <jeff@litech.org>
8# This work is hereby placed in the public domain by its author.
9#
10########################################################################
11#
12# Usage:
13#
14# 1.  Add an entry in $pathetc/newsfeeds like the one below, and issue
15#     `ctlinnd reload newsfeeds traffic` (you can change the path to
16#     whatever you like).
17#
18#             analyze!\
19#                 :*\
20#                 :Tf,WgsbmnN:/usr/local/news/log/traffic
21#
22#     You may find it useful to restrict the articles being logged,
23#     either by modifying the wildmat pattern, or by using the /exclude
24#     notation to indicate articles that have passed through some
25#     servers should not be included.
26#
27#     Also, if $USE_ALL_GROUPS (see below) is false, you can leave out
28#     the N flag, which may eliminate some parsing errors.
29#
30# 2.  Wait for some data.
31#
32# 3.  Run analyze-traffic.pl on the logged data (you can pass the
33#     filename as an argument or feed the data on standard input).
34#     You probably want to pipe it into a file, `less`, or `tail` as the
35#     output is a line for every group that has received an article
36#     according to the input data.
37#
38#     There are some options hardcoded into the script below, under
39#     "Constants" -- check for customization, if you like.
40#
41# 4.  Be sure to comment out the newsfeeds entry when done, or set
42#     up some sort of log rotation, or INN will eventually fill up your
43#     disk...
44#
45########################################################################
46#
47# Implementation notes and known bugs:
48#
49# - We try (if $USE_ALL_GROUPS is set, below) to count crossposted
50#   towards each listed group (even ones not carried on the server!),
51#   but since some articles have funky Newsgroups headers, that can
52#   backfire.  So parsing can fail, which usually results in the
53#   relevant line being skipped, but occasionally can cause Perl to
54#   issue warnings (and perhaps produce funny things in the output).
55#
56#   A workaround would be to repeat e.g. the Message-ID at the end of
57#   the input format (i.e. WgsbmnNm), and then the script could read as
58#   many lines as necessary until that ad hoc end-of-record marker
59#   appeared.  I haven't found a need for this yet, though.
60#
61# - The input format is a sequence of lines, each containing a number of
62#   space-separated fields.  Check newsfeeds(5) for what the semantics
63#   are, but an example line (wrapped), for reference, looks like:
64#
65#       rec.aviation.military [space]
66#       news-out.maxwell.syr.edu [space]
67#       2796 [space]
68#       <3Jvua.104184$My6.1642017@twister.tampabay.rr.com> [space]
69#       @030247454E45524C31000016AD3100000004@ [space]
70#       rec.aviation.military,rec.travel.usa-canada, [no space here]
71#           sci.electronics.design,sci.econ,sci.environment
72#
73# - The output format is a sequence of lines, one for each newsgroup,
74#   with three tab-separated fields.  They are sorted by either the
75#   second or third field, depending on $SORT_BY_SIZE, below.  The first
76#   field is the name of the newsgroup.  The second is the total number
77#   of articles appearing in that newsgroup followed by, in parentheses,
78#   the short name of the peer (see about $TLD_REGEX below) responsible
79#   for the most articles and the percentage it made up.  The third is
80#   the total number of kilobytes of (accepted) traffic in that
81#   newsgroup, followed similarly by the peer responsible for the most
82#   traffic in that group.  It looks something like this:
83#   
84#       news.lists.filters  1057 arts (63% syr)  7105.9 KB (36% cox)
85#
86#   The short names are made by taking the last component of the
87#   (dot-separated) peer name that doesn't match /$TLD_REGEX/.  The idea
88#   is that, for example, "isc.org" would be listed as "isc", and
89#   "demon.co.uk" would be listed as "demon".  Adjust $TLD_REGEX as
90#   needed to trim the top-level domains in your part of the world.
91#
92#   If your peers have very long short names, the output may look
93#   somewhat funny.  Similar things can happen with newsgroup names, so
94#   those longer than $FIELD1_WIDTH will be truncated to fit.  (You can
95#   set $FIELD1_WIDTH to '' to skip this truncation, in which case the
96#   first column will not be space-padded and the output will look a bit
97#   ragged.)
98#
99########################################################################
100#
101# Constants:
102
# Tunable constants -- edit to taste before running.
my $USE_ALL_GROUPS = 1;  # if 0, use only group article is stored under
my $SORT_BY_SIZE = 1;    # if 0, sort output by number of articles
my $FIELD1_WIDTH = 30;   # maximum length of newsgroup name, '' for none

# Labels matching this pattern are considered top-level-domain noise and
# stripped from peer names by trim() below.
my $TLD_REGEX = '^(?:com|net|org|edu|gov|mil|ac|co|uk|au|ca|de)$';
                         # feel free to add any others as needed


########################################################################

use strict;
use warnings;            # lexical complement to the -w shebang switch

# Per-newsgroup accumulator, filled by the input loop:
#   $stats{$group} = { count => N, bytes => N, peers => { $peer => {...} } }
my %stats;
116
# Read the newsfeeds log (filename argument or stdin), one article per
# line, and accumulate per-group / per-peer article counts and byte
# totals into %stats.
while (defined(my $line = <>)) {
    my ($group, $peer, $bytes, $id, $token, @header_groups) = split ' ', $line;

    # A line missing its trailing fields is malformed input -- skip it.
    next unless ($USE_ALL_GROUPS ? @header_groups : $token);

    # The Newsgroups header may wrap; each piece is a comma-separated list.
    my @crossposts = map { split /\s*,\s*/ } @header_groups;

    my @targets = ($USE_ALL_GROUPS && @crossposts) ? @crossposts : ($group);
    for my $ng (@targets) {
        my $rec = $stats{$ng} ||= {
            count => 0,
            bytes => 0,
            peers => {},
        };

        $rec->{count}++;
        $rec->{bytes} += $bytes;
        $rec->{peers}{$peer}{count}++;
        $rec->{peers}{$peer}{bytes} += $bytes;
    }
}
135
136
# Report: one line per newsgroup, sorted ascending by article count or
# by byte total ($SORT_BY_SIZE), so the busiest groups land at the
# bottom -- convenient for piping into `tail`.
my $f = $SORT_BY_SIZE ? 'bytes' : 'count';

foreach (sort { $stats{$a}->{$f} <=> $stats{$b}->{$f} } (keys %stats)) {
    my %s = %{$stats{$_}};

    my ($topcount, $topcountwho) = &max('count', $s{peers});
    my ($topbytes, $topbyteswho) = &max('bytes', $s{peers});

    # max() returns an undefined peer name when no peer exceeds zero;
    # fall back to '?' rather than warning inside trim().
    $topcountwho = defined($topcountwho) ? &trim($topcountwho) : '?';
    $topbyteswho = defined($topbyteswho) ? &trim($topbyteswho) : '?';

    # Guard the percentage math: a group whose articles all logged a
    # size of 0 bytes would otherwise divide by zero and kill the run.
    my $countf = $s{count} ? int(100 * $topcount / $s{count}) : 0;
    my $bytesf = $s{bytes} ? int(100 * $topbytes / $s{bytes}) : 0;
    my $kb = 0.1 * int($s{bytes} * 10 / 1024);   # KB with one decimal place

    # Truncate long group names unless truncation is disabled ('').
    my $ng = $FIELD1_WIDTH eq '' ? $_ : substr($_, 0, $FIELD1_WIDTH);
    print +(sprintf("%-${FIELD1_WIDTH}s\t", $ng)),
          "$s{count} arts ($countf% $topcountwho)\t",
          "${kb} KB ($bytesf% $topbyteswho)\n";
}

1;
157
# trim(HOSTNAME [, TLD_PATTERN])
#
# Reduce a peer's hostname to its "short name": the rightmost
# dot-separated label that does not look like a top-level domain
# (e.g. "news.demon.co.uk" -> "demon").  If every label matches the
# TLD pattern, the original hostname is returned unchanged.
#
# The pattern defaults to the file-level $TLD_REGEX; callers may supply
# their own as an optional second argument (backward compatible).
sub trim {
    my ($name, $tld_re) = @_;
    $tld_re = $TLD_REGEX unless defined $tld_re;

    my @labels = split(/\./, $name);
    while (@labels) {
        my $label = pop(@labels);
        # No /o modifier here: the pattern may now vary between calls,
        # and /o would freeze whichever pattern compiled first.
        return $label unless $label =~ /$tld_re/;
    }
    return $name;
}
166
# max(FIELD, PEERS_HASHREF)
#
# Scan a { peer => { count =>, bytes => } } hash and return the pair
# (highest value of FIELD, name of the peer holding it).  If no peer
# has a value above zero, returns (0, undef).  Ties are broken by hash
# iteration order, which is unspecified.
sub max {
    my ($field, $peers) = @_;

    my $best      = 0;
    my $best_peer = undef;

    for my $peer (keys %{$peers}) {
        my $val = $peers->{$peer}{$field};
        if ($val > $best) {
            $best      = $val;
            $best_peer = $peer;
        }
    }

    return ($best, $best_peer);
}