Feature selection for liblinear

A while back, I wrote about running a grid search for liblinear, the linear classifier.

This time I wanted to do feature selection for liblinear. As it happens, libsvm already ships with a feature-selection script.

So this starts from the lazy idea: can't I just tweak that a little?

First, get the script, fselect.py.

This script apparently calls grid.py internally, and its header has a spot where you write the location of grid.py and the classification command (presumably that's where the kernel used for training gets specified). So I rewrite those for liblinear. This already came up in the post on grid search for liblinear, so check there for details.
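Concretely, it's this part of the Path Setting block at the top of fselect.py (you can see it in context in the full listing further down); the original libsvm setting stays in as a comment:

#original gridpy_exe is below
#gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
#To use with liblinear
gridpy_exe = "./grid.py -log2c -3,0,1 -log2g null"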

With that, the script now does feature selection for liblinear. Running fselect.py apparently generates two files, .fscore and .select.

.fscore: the score of each feature
.select: the post-selection information

seems to be the right way to read them.
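Judging from the write calls in fselect.py further down ("%d: \t%.6f\n" for .fscore, and writelog's "select features: %s\n" for .select), the two files should look roughly like this; the indices and scores here are made up:

3: 	0.284519
1: 	0.119042
...

and somewhere in the .select log:

select features: [3, 1, 7]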

So now we have the selected features; while we're at it, I'd also like the rewriting of the training file fully automated. So I write a script with the following flow:

Run the modified fselect.py
↓
Extract only the selected features from the .select file
↓
Rewrite the original training file so it keeps only the selected features
↓
Write it out

Here it is in Python:

#! /usr/bin/python
# -*- coding:utf-8 -*-
__author__ = 'Kensuke Mitsuzawa'
__date__ = '2014/1/21'

"""
Uses fselect.py from the libsvm tools to keep only the significant features.
usage: python f_based_feature_selection.py path_to_trainfile
RETURN: a train file made up of the significant features only
"""

import os, codecs, re, sys, subprocess

def execute_fselect(trainfile_path):
    """
    Run fselect.py so that features get selected based on their F-scores.
    RETURN: True
    """
    cwd = './'
    args = ['python', 'fselect.py', trainfile_path]
    subproc_args = {'stdin': subprocess.PIPE,
                    'stdout': subprocess.PIPE,
                    'stderr': subprocess.STDOUT,
                    'cwd': cwd,
                    'close_fds': True}
    try:
        p = subprocess.Popen(args, shell=False, **subproc_args)
    except OSError:
        print "Failed to execute command: %s" % args[0]
        sys.exit(1)

    # relay fselect.py's output to the console as it runs
    stdouterr = p.stdout
    print "-" * 80
    while True:
        line = stdouterr.readline()
        if not line:
            break
        print line.rstrip()

    return True

def check_selected_feat(trainfile_path):
    """
    Read only the selected feature numbers from the .select file.
    RETURN: list selected_features [ float feature_number ]
    """
    selectfile = os.path.basename(trainfile_path) + '.select'
    selected_features = []
    with codecs.open(selectfile, 'r', 'utf-8') as lines:
        for line in lines:
            # the line we want looks like "select features: [3, 1, 7]"
            m = re.search(ur'select\sfeatures:\s\[(.+)\]', line)
            if m:
                selected_features = [float(t) for t in m.group(1).split(u',')]
    return selected_features

def preprocess_original_inputfile(trainfile_path):
    """
    Load the original training file into a structured form.
    RETURN: list original_data [ tuple one_instance (unicode label, list feature_and_value [float feature_number, float feature_value]) ]
    """
    train_data = []
    with codecs.open(trainfile_path, 'r', 'utf-8') as lines:
        for line in lines:
            elements = line.split()
            label = elements[0]
            features = [[float(e) for e in f.split(u':')] for f in elements[1:]]
            train_data.append((label, features))
    return train_data

def cutoff_unwanted_features(train_data, selected_features):
    """
    Drop every feature that was not selected.
    RETURN: list after_data [ [label, features] ]
    """
    selected_set = set(selected_features)  # set membership test instead of a list scan
    after_data = []
    for one_instance in train_data:
        label = one_instance[0]
        features = [f for f in one_instance[1] if f[0] in selected_set]
        after_data.append([label, features])
    return after_data

def out_after_data(after_data, trainfile_path):
    outpath = os.path.basename(trainfile_path) + '.after'
    with codecs.open(outpath, 'w', 'utf-8') as f:
        for one_instance in after_data:
            # convert feature numbers and values back to libsvm format;
            # cast the index to int so that 3.0 comes out as "3"
            features = u' '.join([u'%d:%s' % (int(f_tuple[0]), f_tuple[1])
                                  for f_tuple in one_instance[1]])
            label = one_instance[0]
            f.write(label + u' ' + features + u'\n')

if __name__ == '__main__':
    trainfile_path = sys.argv[1]
    execute_fselect(trainfile_path)
    selected_features = check_selected_feat(trainfile_path)
    train_data = preprocess_original_inputfile(trainfile_path)
    after_data = cutoff_unwanted_features(train_data, selected_features)
    out_after_data(after_data, trainfile_path)
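Usage is as in the docstring; the file name here is just an example:

python f_based_feature_selection.py train.txt

This leaves behind train.txt.fscore and train.txt.select from fselect.py, plus train.txt.after, the training file reduced to the selected features.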

One more thing: this code needs my modified fselect.py in the same directory, so I'll paste that as well. fselect.py carried no license information whatsoever, so I did worry a bit about whether it's OK to publish a modified version... oh well.

#!/usr/bin/env python

import random
from random import randrange
import sys
from time import time
from datetime import datetime
#import string
#from string import *
import os
from os import system
from os import unlink
from subprocess import *

##### Path Setting #####

is_win32 = (sys.platform == 'win32')
if not is_win32:
        #original gridpy_exe is below
        #gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
        #To use with liblinear
        gridpy_exe = "./grid.py -log2c -3,0,1 -log2g null"
        svmtrain_exe="../svm-train"
        svmpredict_exe="../svm-predict"
else:
        gridpy_exe = r".\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
        svmtrain_exe=r"..\windows\svmtrain.exe"
        svmpredict_exe=r"..\windows\svmpredict.exe"

##### Global Variables #####

train_pathfile=""
train_file=""
test_pathfile=""
test_file=""
if_predict_all=0

whole_fsc_dict={}
whole_imp_v=[]


def arg_process():
        global train_pathfile, test_pathfile
        global train_file, test_file
        global svmtrain_exe, svmpredict_exe

        if len(sys.argv) not in [2,3]:
                print('Usage: %s training_file [testing_file]' % sys.argv[0])
                raise SystemExit

        train_pathfile=sys.argv[1]
        assert os.path.exists(train_pathfile),"training file not found"
        train_file = os.path.split(train_pathfile)[1]

        if len(sys.argv) == 3:
                test_pathfile=sys.argv[2]
                assert os.path.exists(test_pathfile),"testing file not found"
                test_file = os.path.split(test_pathfile)[1]


##### Decide sizes of selected features #####

def feat_num_try_half(max_index):
        v=[]
        while max_index > 1:
                v.append(max_index)
                max_index //= 2
        return v

def feat_num_try(f_tuple):
        for i in range(len(f_tuple)):
                if f_tuple[i][1] < 1e-20:
                        i=i-1; break
        #only take first eight numbers (>1%)
        return feat_num_try_half(i+1)[:8]


def random_shuffle(label, sample):
        random.seed(1)  # so that result is the same every time
        size = len(label)
        for i in range(size):
                ri = randrange(0, size-i)
                tmp = label[ri]
                label[ri] = label[size-i-1]
                label[size-i-1] = tmp
                tmp = sample[ri]
                sample[ri] = sample[size-i-1]
                sample[size-i-1] = tmp



### compare function used in list.sort(): sort by element[1]
#def value_cmpf(x,y):
#       if x[1]>y[1]: return -1
#       if x[1]<y[1]: return 1
#       return 0
def value_cmpf(x):
        return (-x[1]);

### cal importance of features
### return fscore_dict and feat with desc order
def cal_feat_imp(label,sample):

        print("calculating fsc...")

        score_dict=cal_Fscore(label,sample)

        score_tuples = list(score_dict.items())
        score_tuples.sort(key = value_cmpf)

        feat_v = score_tuples
        for i in range(len(feat_v)): feat_v[i]=score_tuples[i][0]

        print("fsc done")
        return score_dict,feat_v


### select features and return new data
def select(sample, feat_v):
        new_samp = []

        feat_v.sort()

        #for each sample
        for s in sample:
                point={}
                #for each feature to select
                for f in feat_v:
                        if f in s: point[f]=s[f]

                new_samp.append(point)

        return new_samp


### Do parameter searching (grid.py) 
def train_svm(tr_file):
        cmd = "%s %s" % (gridpy_exe,tr_file)
        print(cmd)
        print('Cross validation...')
        std_out = Popen(cmd, shell = True, stdout = PIPE).stdout

        line = ''
        while 1:
                last_line = line
                line = std_out.readline()
                if not line: break
                
        if len(last_line.split())==2:
                # grid.py modified for liblinear prints only "c rate"
                print('This is liblinear mode')
                g = 0
                c,rate = map(float,last_line.split())

        elif len(last_line.split())==3:
                # unmodified grid.py prints "c g rate"
                c,g,rate = map(float,last_line.split())

        print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))

        return c,g,rate

### Given (C,g) and training/testing data,
### return predicted labels
def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
        global train_file
        tr_file = train_file+".tr"
        te_file = train_file+".te"
        if model_name:  model_file = model_name
        else:  model_file = "%s.model"%tr_file
        out_file = "%s.o"%te_file
        
        # train
        writedata(tr_sample,tr_label,tr_file)
        cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
        os.system(cmd) 

        # test
        writedata(test_sample,test_label,te_file)
        cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
        print(cmd)
        os.system(cmd)
        
        # fill in pred_y
        pred_y=[]
        fp = open(out_file)
        line = fp.readline()
        while line:
                pred_y.append( float(line) )
                line = fp.readline()
        
        rem_file(tr_file)
        #rem_file("%s.out"%tr_file)
        #rem_file("%s.png"%tr_file)
        rem_file(te_file)
        if del_model: rem_file(model_file)
        fp.close()
        rem_file(out_file)
        
        return pred_y


def cal_acc(pred_y, real_y):
        right = 0.0

        for i in range(len(pred_y)):
                if(pred_y[i] == real_y[i]): right += 1

        print("ACC: %d/%d"%(right, len(pred_y)))
        return right/len(pred_y)

### balanced accuracy
def cal_bacc(pred_y, real_y):
        p_right = 0.0
        n_right = 0.0
        p_num = 0
        n_num = 0

        size=len(pred_y)
        for i in range(size):
                if real_y[i] == 1:
                        p_num+=1
                        if real_y[i]==pred_y[i]: p_right+=1
                else:
                        n_num+=1
                        if real_y[i]==pred_y[i]: n_right+=1

        print([p_right,p_num,n_right,n_num])
        writelog("       p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
        if p_num==0: p_num=1
        if n_num==0: n_num=1
        return 0.5*( p_right/p_num + n_right/n_num )


##### Log related #####
def initlog(name):
        global logname
        logname = name
        logfile_fd = open(logname, 'w')
        logfile_fd.close()


VERBOSE_MAX=100
VERBOSE_ITER = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME = 1

def writelog(str, vlevel=VERBOSE_MAX):
        global logname
        if vlevel > VERBOSE_ITER:
                logfile_fd = open(logname, 'a')
                logfile_fd.write(str)
                logfile_fd.close()


def rem_file(filename):
        #system("rm -f %s"%filename)
        try:
            unlink(filename)
        except OSError:
            print('.png file does not exist. However...no problem! \(o_o)/')

##### MAIN FUNCTION #####
def main():
        global train_pathfile, train_file
        global test_pathfile, test_file
        global whole_fsc_dict,whole_imp_v

        times=5 #number of hold-out times
        accuracy=[]

        ### Read Data
        print("reading....")
        t=time()
        train_label, train_sample, max_index = readdata(train_pathfile)
        t=time()-t
        writelog("loading data '%s': %.1f sec.\n"%(train_pathfile,t), VERBOSE_TIME)
        print("read done")

        ### Randomly shuffle data
        random_shuffle(train_label, train_sample)


        ###calculate f-score of whole training data
        #whole_imp_v contains feat with order
        t=time()
        whole_fsc_dict,whole_imp_v = cal_feat_imp(train_label,train_sample)
        t=time()-t
        writelog("cal f-score time: %.1f\n"%t, VERBOSE_TIME)

        ###write (sorted) f-score list in another file
        f_tuples = list(whole_fsc_dict.items())
        f_tuples.sort(key = value_cmpf)
        fd = open("%s.fscore"%train_file, 'w')
        for t in f_tuples:
                fd.write("%d: \t%.6f\n"%t)
        fd.close()


        ### decide sizes of features to try
        fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]
        for i in range(len(fnum_v)):
                accuracy.append([])
        writelog("try feature sizes: %s\n\n"%(fnum_v))


        writelog("%#Feat\test. acc.\n")

        est_acc=[]
        #for each possible feature subset
        for j in range(len(fnum_v)):

                fn = fnum_v[j]  # fn is the number of features selected
                fv = whole_imp_v[:fn] # fv is indices of selected features

                t=time()
                #pick features
                tr_sel_samp = select(train_sample, fv)
                tr_sel_name = train_file+".tr"
                t=time()-t
                writelog("\n   feature num: %d\n"%fn, VERBOSE_ITER)
                writelog("      pick time: %.1f\n"%t, VERBOSE_TIME)

                t=time()
                writedata(tr_sel_samp,train_label,tr_sel_name)
                t=time()-t
                writelog("      write data time: %.1f\n"%t, VERBOSE_TIME)


                t=time()
                # choose best c, gamma from splitted training sample
                c,g, cv_acc = train_svm(tr_sel_name)
                t=time()-t
                writelog("      choosing c,g time: %.1f\n"%t, VERBOSE_GRID_TIME)

                est_acc.append(cv_acc)
                writelog("%d:\t%.5f\n"%(fnum_v[j],cv_acc) )

        print(fnum_v)
        print(est_acc)

        fnum=fnum_v[est_acc.index(max(est_acc))]
#       print(est_acc.index(max(est_acc)))
        print('Number of selected features %s' % fnum)
        print('Please see %s.select for details' % train_file)

        #result for features selected
        sel_fv = whole_imp_v[:fnum]

        writelog("max validation accuarcy: %.6f\n"%max(est_acc))
        writelog("\nselect features: %s\n"%sel_fv)
        writelog("%s features\n"%fnum)
                

        # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
        rem_file(tr_sel_name)
        rem_file("%s.out"%tr_sel_name)
        rem_file("%s.png"%tr_sel_name)


        ### do testing 

        test_label=None
        test_sample=None
        if test_pathfile != "":
                print("reading testing data....")
                test_label, test_sample, max_index = readdata(test_pathfile)
                writelog("\nloading testing data '%s'\n"%test_pathfile)
                print("read done")
                
                #picking features
                train_sel_samp = select(train_sample, sel_fv)
                test_sel_samp = select(test_sample, sel_fv)

                #grid search
                train_sel_name = "%s.%d"%(train_file,fnum)
                writedata(train_sel_samp,train_label,train_sel_name)
                c,g, cv_acc = train_svm(train_sel_name)
                writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))

                # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
                rem_file(train_sel_name)




                ### predict
                pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp, 0, "%s.model"%train_sel_name)

                #calculate accuracy
                acc = cal_acc(pred_y, test_label)
                ##acc = cal_bacc(pred_y, test_label)
                writelog("testing accuracy = %.6f\n"%acc)

                #writing predict labels
                out_name = "%s.%d.pred"%(test_file,fnum)
                fd = open(out_name, 'w')
                for y in pred_y: fd.write("%f\n"%y)
                fd.close()
                

### predict all possible sets ###
def predict_all():

        global train_pathfile, train_file
        global test_pathfile, test_file

        global whole_fsc_dict,whole_imp_v

        train_label, train_sample, max_index = readdata(train_pathfile)
        test_label, test_sample, m = readdata(test_pathfile)

        random_shuffle(train_label, train_sample)

        ###whole_fsc_dict, ordered_feats = cal_feat_imp(train_label,train_sample)
        ordered_feats = whole_imp_v
        f_tuples = list(whole_fsc_dict.items())
        f_tuples.sort(key = value_cmpf)

        fnum_v = feat_num_try(f_tuples) #ex: [50,25,12,6,3,1]

        writelog("\nTest All %s\n"%fnum_v)
        for fnum in fnum_v:
                sel_fv = ordered_feats[:fnum]

                #picking features
                train_sel_samp = select(train_sample, sel_fv)
                test_sel_samp = select(test_sample, sel_fv)

                #grid search
                train_sel_name = "%s.%d"%(train_file,fnum)
                writedata(train_sel_samp,train_label,train_sel_name)
                c,g, cv_acc = train_svm(train_sel_name)
                writelog("best (c,g)= %s, cv-acc = %.6f\n"%([c,g],cv_acc))

                # REMOVE INTERMEDIATE TEMPORARY FILE: training file after selection
                rem_file(train_sel_name)

                #predict
                pred_y = predict(train_label, train_sel_samp, c, g, test_label, test_sel_samp)

                #calculate accuracy
                acc = cal_acc(pred_y, test_label)
                ##acc = cal_bacc(pred_y, test_label)
                writelog("feat# %d, testing accuracy = %.6f\n"%(fnum,acc))

                #writing predict labels
                out_name = "%s.%d.pred"%(test_file,fnum)
                fd = open(out_name, 'w')
                for y in pred_y: fd.write("%f\n"%y)
                fd.close()

                del_out_png = 0
                if del_out_png:
                        rem_file("%s.out"%train_sel_name)
                        rem_file("%s.png"%train_sel_name)


###return a dict containing F_j
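### What the loop below computes, written out: the F-score of feature j is
###   F(j) = SB / SW, with
###   SB = sum_k  n_k * (mean_{k,j} - mean_j)^2              (between-class spread)
###   SW = sum_k  sum_{x in class k} (x_j - mean_{k,j})^2    (within-class spread)
### where n_k is the size of class k; SW is accumulated via the identity
### sum (x - mean)^2 = sum x^2 - (sum x)^2 / n, with eps added to avoid
### division by zero.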
def cal_Fscore(labels,samples):

        data_num=float(len(samples))
        p_num = {} #key: label;  value: data num
        sum_f = [] #index: feat_idx;  value: sum
        sum_l_f = {} #dict of lists.  key1: label; index2: feat_idx; value: sum
        sumq_l_f = {} #dict of lists.  key1: label; index2: feat_idx; value: sum of square
        F={} #key: feat_idx;  value: fscore
        max_idx = -1

        ### pass 1: check number of each class and max index of features
        for p in range(len(samples)): # for every data point
                label=labels[p]
                point=samples[p]

                if label in p_num: p_num[label] += 1
                else: p_num[label] = 1

                for f in point.keys(): # for every feature
                        if f>max_idx: max_idx=f
        ### now p_num and max_idx are set

        ### initialize variables
        sum_f = [0 for i in range(max_idx)]
        for la in p_num.keys():
                sum_l_f[la] = [0 for i in range(max_idx)]
                sumq_l_f[la] = [0 for i in range(max_idx)]

        ### pass 2: calculate some stats of data
        for p in range(len(samples)): # for every data point
                point=samples[p]
                label=labels[p]
                for tuple in point.items(): # for every feature
                        f = tuple[0]-1 # feat index
                        v = tuple[1] # feat value
                        sum_f[f] += v
                        sum_l_f[label][f] += v
                        sumq_l_f[label][f] += v**2
        ### now sum_f, sum_l_f, sumq_l_f are done

        ### for each feature, calculate f-score
        eps = 1e-12
        for f in range(max_idx):
                SB = 0
                for la in p_num.keys():
                        SB += (p_num[la] * (sum_l_f[la][f]/p_num[la] - sum_f[f]/data_num)**2 )

                SW = eps
                for la in p_num.keys():
                        SW += (sumq_l_f[la][f] - (sum_l_f[la][f]**2)/p_num[la]) 

                F[f+1] = SB / SW

        return F


###### svm data IO ######

def readdata(filename):
        labels=[]
        samples=[]
        max_index=0
        #load training data
        fp = open(filename)
        line = fp.readline()

        while line:
                # added by untitled, allowing data with comments
                line=line.strip()
                if line[0]=="#":
                        line = fp.readline()
                        continue

                elems = line.split()
                sample = {}
                for e in elems[1:]:
                        points = e.split(":")
                        p0 = int( points[0].strip() )
                        p1 = float( points[1].strip() )
                        sample[p0] = p1
                        if p0 > max_index:
                                max_index = p0
                labels.append(float(elems[0]))
                samples.append(sample)
                line = fp.readline()
        fp.close()

        return labels,samples,max_index
def writedata(samples,labels,filename):
        fp=sys.stdout
        if filename:
                fp=open(filename,"w")

        num=len(samples)
        for i in range(num):
                if labels: 
                        fp.write("%s"%labels[i])
                else:
                        fp.write("0")
                kk=list(samples[i].keys())
                kk.sort()
                for k in kk:
                        fp.write(" %d:%f"%(k,samples[i][k]))
                fp.write("\n")

        fp.flush()
        fp.close()


###### PROGRAM ENTRY POINT ######

arg_process()

initlog("%s.select"%train_file)
writelog("start: %s\n\n"%datetime.now())
main()

# do testing on all possible feature sets
if if_predict_all :
        predict_all()

writelog("\nend: \n%s\n"%datetime.now())