| 
       1                 : /** @file
       2                 :  * @author Enrico Zini <enrico@enricozini.org>
       3                 :  * Correlate popcon data with local popcon information
       4                 :  */
       5                 : 
       6                 : /*
       7                 :  * Copyright (C) 2007  Enrico Zini <enrico@debian.org>
       8                 :  *
       9                 :  * This program is free software; you can redistribute it and/or modify
      10                 :  * it under the terms of the GNU General Public License as published by
      11                 :  * the Free Software Foundation; either version 2 of the License, or
      12                 :  * (at your option) any later version.
      13                 :  *
      14                 :  * This program is distributed in the hope that it will be useful,
      15                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      16                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17                 :  * GNU General Public License for more details.
      18                 :  *
      19                 :  * You should have received a copy of the GNU General Public License
      20                 :  * along with this program; if not, write to the Free Software
      21                 :  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
      22                 :  */
      23                 : 
      24                 : #include <ept/popcon/local.h>
      25                 : #include <ept/popcon/popcon.h>
      26                 : #include <ept/popcon/maint/path.h>
      27                 : 
      28                 : #include <wibble/exception.h>
      29                 : 
      30                 : #include <algorithm>
      31                 : #include <fstream>
      32                 : #include <cmath>
      33                 : 
      34                 : //#include <iostream>
      35                 : 
      36                 : using namespace std;
      37                 : 
      38                 : namespace ept {
      39                 : namespace popcon {
      40                 : 
      41                 : // Split a string where there are separators
      42            1903 : static vector<string> split(const std::string& str, char sep = ' ')
      43                 : {
      44            1903 :         vector<string> res;
      45            1903 :         size_t start = 0;
      46           10435 :         while (start < str.size())
      47                 :         {
      48            8531 :                 size_t end = str.find(sep, start);
      49            8531 :                 if (end == string::npos)
      50                 :                 {
      51            1902 :                         res.push_back(str.substr(start));
      52            1902 :                         break;
      53                 :                 }
      54                 :                 else
      55                 :                 {
      56            6629 :                         res.push_back(str.substr(start, end-start));
      57            6629 :                         start = end + 1;
      58                 :                 }
      59                 :         }
      60               0 :         return res;
      61                 : }
      62                 : 
      63                 : // Reverse sort pairs by comparing their second element
      64                 : struct secondsort
      65                 : {
      66               0 :         bool operator()(const pair<string, float>& a, const pair<string, float>& b) const
      67                 :         {
      68               0 :                 if (a.second == b.second)
      69               0 :                         return a.first > b.first;
      70                 :                 else
      71               0 :                         return a.second > b.second;
      72                 :         }
      73                 : };
      74                 : 
      75               1 : Local::Local(const std::string& file)
      76                 : {
      77               1 :         m_timestamp = Path::timestamp(file);
      78               1 :         if (m_timestamp == 0)
      79               0 :                 return;
      80                 :         
      81               1 :         ifstream in;
      82               1 :         in.open(file.c_str());
      83               1 :         if (!in.good())
      84               0 :                 throw wibble::exception::File(file, "opening file for reading");
      85                 : 
      86            1907 :         while (!in.eof())
      87                 :         {
      88            1905 :                 std::string line;
      89            1905 :                 getline(in, line);
      90            1905 :                 if (line.substr(0, 10) == "POPULARITY")
      91               3 :                         continue;
      92            1904 :                 if (line.substr(0, 14) == "END-POPULARITY")
      93                 :                         continue;
      94            1903 :                 vector<string> data = split(line);
      95            1903 :                 if (data.size() < 4)
      96                 :                         continue;
      97            1902 :                 if (data[3] == "<NOFILES>")
      98                 :                         // This is an empty / virtual package
      99             979 :                         m_scores.insert(make_pair(data[2], 0.1));
     100             923 :                 else if (data.size() == 4)
     101                 :                         // Package normally in use
     102               0 :                         m_scores.insert(make_pair(data[2], 1.0));
     103             923 :                 else if (data[4] == "<OLD>")
     104                 :                         // Unused packages
     105             745 :                         m_scores.insert(make_pair(data[2], 0.3));
     106             178 :                 else if (data[4] == "<RECENT-CTIME>")
     107                 :                         // Recently installed packages
     108             178 :                         m_scores.insert(make_pair(data[2], 0.5));
     109               1 :         }
     110               0 : }
     111                 : 
     112               2 : float Local::score(const std::string& pkg) const
     113                 : {
     114               2 :         std::map<std::string, float>::const_iterator i = m_scores.find(pkg);
     115               2 :         if (i == m_scores.end())
     116               0 :                 return 0;
     117                 :         else
     118               2 :                 return i->second;
     119                 : }
     120                 : 
     121                 : /**
     122                 :  * Return the TFIDF score of the package computed against the popcon
     123                 :  * information.
     124                 :  */
     125               1 : float Local::tfidf(const Popcon& popcon, const std::string& pkg) const
     126                 : {
     127               1 :         float popconScore = popcon.score(pkg);
     128                 :         //cerr << pkg << ": " << score(pkg) << " * log(" << (float)popcon.submissions() << " / " << popconScore << ") = " << score(pkg) * log((float)popcon.submissions() / popconScore) << endl;
     129               1 :         if (popconScore == 0)
     130               0 :                 return 0;
     131                 :         else
     132               1 :                 return score(pkg) * log((float)popcon.submissions() / popconScore);
     133                 :         
     134                 : }
     135                 : 
     136               0 : std::vector< std::pair<std::string, float> > Local::scores() const
     137                 : {
     138               0 :         vector< pair<string, float> > res;
     139                 :         // Copy the scores in res
     140               0 :         copy(m_scores.begin(), m_scores.end(), back_inserter(res));
     141                 :         // Sort res by score
     142               0 :         sort(res.begin(), res.end(), secondsort());
     143               0 :         return res;
     144                 : }
     145                 : 
     146               0 : std::vector< std::pair<std::string, float> > Local::tfidf(const Popcon& popcon) const
     147                 : {
     148               0 :         vector< pair<string, float> > res;
     149                 :         // Compute the tfidf scores and store them into res
     150               0 :         for (std::map<std::string, float>::const_iterator i = m_scores.begin();
     151                 :                         i != m_scores.end(); ++i)
     152                 :         {
     153               0 :                 float popconScore = popcon.score(i->first);
     154               0 :                 if (popconScore == 0)
     155               0 :                         res.push_back(make_pair(i->first, 0.0f));
     156                 :                 else
     157                 :                         res.push_back(make_pair(i->first,
     158               0 :                                                 i->second * log((float)popcon.submissions() / popconScore)));
     159                 :         }
     160                 :         // Sort res by score
     161               0 :         sort(res.begin(), res.end(), secondsort());
     162               0 :         return res;
     163                 : }
     164                 : 
     165                 : }
     166               6 : }
     167                 : 
     168                 : // vim:set ts=4 sw=4:
 |