import pandas as pd
from scipy import stats
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import glob
import math
from collections import OrderedDict
from functools import partial
from decimal import Decimal, getcontext


data = './3classdata/W/1.csv'
# data = './progetto/3classdata/Y/2.csv'
window = 3        # word length: number of symbols per word
resolution = 500  # half the number of discretization points
shift = 3         # step between consecutive words in the rolling window


# fixed-precision rounding; note that this shadows the built-in round()
def round(n):
    pst = '0.00001'
    precision = Decimal(pst)
    getcontext().prec = len(pst)
    return Decimal(n).quantize(precision)

assert round(5.00000000000000001)


# 1. read file
def read_csv(document):
    df = pd.read_csv(document, dtype=float, header=None)
    # rotate dataframe (each column is a temporal sequence)
    df = df.T.reset_index().reindex(axis=1).drop(['index'], axis=1).infer_objects()
    return df


# 2. rescale each column to the range [-1, 1]
def normalize(df):
    df = ((df - df.min()) / (df.max() - df.min())) * 2 - 1
    return df


# 3. gaussian + discretize
def gaussian_interval(resolution):
    # use a gaussian distribution, mean 0, std deviation 0.25
    distribution = stats.norm(loc=0, scale=0.25)
    # bounds of range for inverse cumulative distribution function
    bounds = distribution.cdf([-1, 1])
    # generate linear space of 2*resolution points using bounds
    linsp = np.linspace(*bounds, num=2*resolution)
    # obtain the array of 2*resolution points (bin edges, denser around 0)
    return distribution.ppf(linsp)


def discretize(df, interval, how="left"):
    # replace each value with the left or right edge of the bin it falls into
    if how == "left":
        return df.apply(lambda x: pd.cut(x, interval, right=True, precision=5).apply(lambda l: l.left))
    elif how == "right":
        return df.apply(lambda x: pd.cut(x, interval, right=True, precision=5).apply(lambda l: l.right))
    else:
        assert False, "specify left / right"


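# A minimal sketch of how the preprocessing steps above compose; the helper
# name is illustrative and not part of the original pipeline.
def _example_preprocess(path=data, res=resolution):
    df = read_csv(path)                # one column per temporal sequence
    df = normalize(df)                 # rescale to [-1, 1]
    interval = gaussian_interval(res)  # gaussian bin edges
    return discretize(df, interval, how="left")

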
def parallel(n=4, target=None, iterable=None, **kwargs):
    assert target is not None
    assert iterable is not None

    fun = partial(target, **kwargs)

    from multiprocessing import Pool
    with Pool(n) as p:
        if type(iterable) is OrderedDict:
            # preserve the key -> result association
            values = p.map(fun, iterable.values())
            return OrderedDict((k, v) for k, v in zip(iterable.keys(), values))
        else:
            return p.map(fun, iterable)


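# A minimal sketch of 'parallel' applied to a list and to an OrderedDict of
# dataframes; the glob pattern is a guess based on the 'data' path above and
# the helper name is illustrative.
def _example_parallel(pattern='./3classdata/W/*.csv'):
    dfs = [read_csv(f) for f in sorted(glob.glob(pattern))]
    as_list = parallel(n=4, target=normalize, iterable=dfs)
    as_dict = parallel(n=4, target=normalize,
                       iterable=OrderedDict(zip(sorted(glob.glob(pattern)), dfs)))
    return as_list, as_dict

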
# rolling window generates words
def generate_words(col, w, s):
    word = []
    idx = 0
    distance = abs(w - s)
    while idx < len(col):
        if len(word) == w:
            # a full word of w symbols: emit it, then move the index
            # according to the shift s before starting the next word
            yield tuple(word)
            word = []
            if w < s and idx == w:
                idx += distance
            if w > s:
                idx -= distance
        else:
            word.append(round(col[idx]))
            idx += 1


# designed to be used on a df
# use 'parallel' for dfs
def rolling_window(df, w, s):
    ret = list()
    for _, col in df.items():
        ret.append(tuple(generate_words(col, w, s)))
    return pd.DataFrame(ret).T.reset_index().reindex(axis=1).drop(['index'], axis=1)


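# A minimal sketch of the rolling window: with window == shift the columns are
# split into consecutive, non-overlapping words of 'window' rounded symbols.
# The helper name is illustrative.
def _example_rolling_window(df):
    words_df = rolling_window(df, window, shift)
    # or, for an OrderedDict of dataframes:
    # words = parallel(target=rolling_window, iterable=odct, w=window, s=shift)
    return words_df

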
# count term frequency within a column
def text_freq(col):
    d = dict()
    for word, cnt in col.value_counts().items():
        hashable = word  # hashable to be used as index
        if hashable in d:
            d[hashable] += cnt
        else:
            d[hashable] = cnt
    return d


# designed to be used on a df
# use 'parallel' for dfs
def extract_words(df, how=None, nwords=3):
    colList = list()
    for _, col in df.items():
        freq = how(col)
        d = dict()
        for entry in col:
            d[entry] = freq[entry]
        # keep the nwords entries with the highest frequency
        colList.append(tuple(sorted(d, key=lambda x: d[x])[-nwords:]))
    return pd.DataFrame(colList).T


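# A minimal sketch of extracting the most frequent words per column, using
# 'text_freq' as the frequency measure; the helper name is illustrative.
def _example_extract_words(words_df):
    return extract_words(words_df, how=text_freq, nwords=3)

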
def term_in_df(df, term):
    # count in how many columns (sensors) the term appears
    cnt = 0
    for _, col in df.items():
        if term in col.values:
            cnt += 1
            print(col)
    assert cnt <= 20  # at most 20 sensors per dataframe
    return cnt


def funLog(word, length, sets):
    # idf of a single word: log10(#documents / #documents containing the word)
    return (word, math.log10(length / sum([word in s for s in sets])))


def idf(dfs):
    words = set()
    D = list()  # one set of words per column (document)
    for df in dfs:
        for _, col in df.items():
            D.append(set(col.values))
            words.update(col.values)

    idfs = parallel(target=funLog, iterable=words,
                    length=len(D), sets=D, n=12)
    return dict(idfs)


def _chunker(seq, size):
    assert type(seq) is list
    return (seq[pos: pos+size] for pos in range(0, len(seq), size))


def tfidf(odct, nwords=3):
    assert type(odct) is OrderedDict
    lst = list(odct.values())
    idfs = idf(lst)  # idf of every word across all dataframes, in parallel
    tfs = list()
    for df in lst:
        for _, col in df.items():
            tfs.append(text_freq(col))  # associate every word with its tf for each sensor

    tf_idfs = list()
    for tf in tfs:
        d = dict()
        for word, t in tf.items():
            d[word] = t * idfs[word]
        tf_idfs.append(d)

    dflist = list()  # extract the nwords best-scoring words per sensor
    for sensors in _chunker(tf_idfs, 20):  # 20 sensors (columns) per dataframe
        tidList = list()
        for s in sensors:
            bestThree = sorted(s, key=lambda x: s[x])[-nwords:]
            tidList.append(tuple(bestThree))
        dflist.append(pd.DataFrame(tidList).T)
    return OrderedDict((k, v) for k, v in zip(odct.keys(), dflist))
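

# A minimal end-to-end sketch, assuming each CSV in the class directory holds
# one recording with one sensor per column; the glob pattern and variable
# names are illustrative, not part of the original pipeline.
if __name__ == '__main__':
    interval = gaussian_interval(resolution)
    odct = OrderedDict()
    for path in sorted(glob.glob('./3classdata/W/*.csv')):
        df = discretize(normalize(read_csv(path)), interval, how="left")
        odct[path] = rolling_window(df, window, shift)
    best = tfidf(odct, nwords=3)  # nwords best tf-idf words per sensor
    for path, words in best.items():
        print(path)
        print(words)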