# UniTO/anno2/YearI/SecondSem/BDM/progetto/newsource/libword.py
import glob
import math
from collections import OrderedDict
from decimal import Decimal, getcontext
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from scipy import stats
data = './3classdata/W/1.csv'
#data = './progetto/3classdata/Y/2.csv'
window = 3        # samples per word
resolution = 500  # half the number of discretization breakpoints
shift = 3         # samples between the starts of consecutive words
def round(n):
    # quantize to five decimal places so readings that differ only below that
    # precision collapse to the same symbol (intentionally shadows the builtin)
    pst = '0.00001'
    precision = Decimal(pst)
    getcontext().prec = len(pst)
    return Decimal(n).quantize(precision)
assert round(5.00000000000000001)
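# Example (sketch): values that differ below the fifth decimal collapse to the
# same symbol, e.g. round(0.123456) == round(0.1234561) == Decimal('0.12346').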
# 1. read file
def read_csv(document):
    df = pd.read_csv(document, dtype=float, header=None)
    # rotate dataframe (each column is a temporal sequence)
    df = df.T.reset_index(drop=True).infer_objects()
    return df
# 2
def normalize(df):
    # min-max scale every column to [-1, 1]
    df = ((df - df.min()) / (df.max() - df.min())) * 2 - 1
    return df
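# Example: a column [0, 5, 10] maps to [-1.0, 0.0, 1.0]; note that a constant
# column would divide by zero here and come out as all NaN.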
# 3 gaussian + discretize
def gaussian_interval(resolution):
    # use a gaussian distribution, mean 0, std deviation 0.25
    distribution = stats.norm(loc=0, scale=0.25)
    # bounds of range for inverse cumulative distribution function
    bounds = distribution.cdf([-1, 1])
    # generate linear space of 2*resolution points using bounds
    linsp = np.linspace(*bounds, num=2*resolution)
    # obtain the array of 2*resolution breakpoints
    return distribution.ppf(linsp)
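# Example: gaussian_interval(500) yields 1000 increasing breakpoints spanning
# [-1, 1]; equal CDF steps make them densest around 0, so the bins are
# narrowest where the normalized signal spends most of its time.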
def discretize(df, interval, how="left"):
    # label every value with one edge of its bin; include_lowest keeps the
    # minimum (exactly -1 after normalize) inside the first bin
    if how == "left":
        return df.apply(lambda x: pd.cut(x, interval, right=True, include_lowest=True, precision=5).apply(lambda l: l.left))
    elif how == "right":
        return df.apply(lambda x: pd.cut(x, interval, right=True, include_lowest=True, precision=5).apply(lambda l: l.right))
    else:
        assert False, "specify left / right"
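# Example: the preprocessing chain turns raw floats into a small alphabet of
# bin edges, e.g.
#   interval = gaussian_interval(resolution)
#   symbols = discretize(normalize(read_csv(data)), interval, how="left")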
def parallel(n=4, target=None, iterable=None, **kwargs):
    assert target is not None
    assert iterable is not None
    fun = partial(target, **kwargs)
    with Pool(n) as p:
        if type(iterable) is OrderedDict:
            # preserve key order: map over the values, then zip the keys back in
            values = p.map(fun, iterable.values())
            return OrderedDict((k, v) for k, v in zip(iterable.keys(), values))
        else:
            return p.map(fun, iterable)
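# Usage sketch (hypothetical helper; the target must live at module level so
# it can be pickled, and callers should run under `if __name__ == "__main__"`):
#   def add(x, k=0): return x + k
#   parallel(n=2, target=add, iterable=[1, 2, 3], k=10)   # -> [11, 12, 13]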
# rolling window generates words
def generate_words(col, w, s):
    # emit tuples of w rounded values, starting a new word every s samples
    word = []
    idx = 0
    distance = abs(w - s)
    while idx < len(col):
        if len(word) == w:
            yield tuple(word)
            word = []
            if w < s:
                # gap between windows: skip the s-w samples in between
                idx += distance
            if w > s:
                # overlapping windows: step back w-s samples
                idx -= distance
        else:
            word.append(round(col[idx]))
            idx += 1
    if len(word) == w:
        # flush the final complete word
        yield tuple(word)
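# Example (w == s == 3, non-overlapping words):
#   list(generate_words(pd.Series([.1, .2, .3, .4, .5, .6]), 3, 3))
#   -> [(Decimal('0.10000'), Decimal('0.20000'), Decimal('0.30000')),
#       (Decimal('0.40000'), Decimal('0.50000'), Decimal('0.60000'))]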
# designed to be used on a df
# use 'parallel' for dfs
def rolling_window(df, w, s):
    ret = list()
    for _, col in df.items():
        ret.append(tuple(generate_words(col, w, s)))
    return pd.DataFrame(ret).T.reset_index(drop=True)
# count term frequency of columns
def text_freq(col):
    d = dict()
    for word, cnt in col.value_counts().items():
        hashable = word  # hashable to be used as index
        if hashable in d:
            d[hashable] += cnt
        else:
            d[hashable] = cnt
    return d
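# Example: a column holding the words [a, b, a] maps to {a: 2, b: 1}; the keys
# are the word tuples themselves, so they can be reused as dictionary indices.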
# designed to be used on a df
# use 'parallel' for dfs
def extract_words(df, how=None, nwords=3):
    colList = list()
    for _, col in df.items():
        freq = how(col)
        d = dict()
        for entry in col:
            d[entry] = freq[entry]
        colList.append(tuple(sorted(d, key=lambda x: d[x])[-nwords:]))
    return pd.DataFrame(colList).T
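# Example: keep, for every sensor column, the nwords most frequent words
# (ties resolved by sort order):
#   top = extract_words(rolling_window(df, window, shift), how=text_freq)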
def term_in_df(df, term):
    # count in how many sensor columns the term appears
    cnt = 0
    for _, col in df.items():
        if term in col.values:  # membership on the values, not on the index
            cnt += 1
    assert cnt <= 20  # at most 20 sensors per dataframe
    return cnt
def funLog(word, length, sets):
    # idf(word) = log10(N / number of documents containing the word),
    # where every sensor column counts as one document
    return (word, math.log10(length / sum(word in s for s in sets)))
def idf(dfs):
    words = set()
    D = list()
    for df in dfs:
        for _, col in df.items():
            D.append(set(col.values))
            words.update(col.values)
    idfs = parallel(target=funLog, iterable=words,
                    length=len(D), sets=D, n=12)
    return dict(idfs)
def _chunker(seq, size):
    assert type(seq) is list
    return (seq[pos: pos+size] for pos in range(0, len(seq), size))
def tfidf(odct, nwords=3):
    assert type(odct) is OrderedDict
    lst = list(odct.values())
    idfs = idf(lst)  # compute idf of each dataframe, in parallel
    tfs = list()
    for df in lst:
        for _, col in df.items():
            tfs.append(text_freq(col))  # associate every word with tf for each sensor
    tf_idfs = list()
    for tf in tfs:
        d = dict()
        for word, t in tf.items():
            d[word] = t * idfs[word]
        tf_idfs.append(d)
    dflist = list()  # extract the nwords best-scoring words per sensor
    for sensors in _chunker(tf_idfs, 20):  # 20 sensors per dataframe
        tidList = list()
        for s in sensors:
            bestThree = sorted(s, key=lambda x: s[x])[-nwords:]
            tidList.append(tuple(bestThree))
        dflist.append(pd.DataFrame(tidList).T)
    return OrderedDict((k, v) for k, v in zip(odct.keys(), dflist))
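
# End-to-end sketch (hypothetical driver, assuming the ./3classdata layout
# referenced above; the glob pattern, class folder W and nwords=3 are
# illustrative, not part of the library):
if __name__ == "__main__":
    interval = gaussian_interval(resolution)
    odct = OrderedDict()
    for path in sorted(glob.glob('./3classdata/W/*.csv')):
        df = discretize(normalize(read_csv(path)), interval)
        odct[path] = rolling_window(df, window, shift)
    best = tfidf(odct, nwords=3)
    for name, words in best.items():
        print(name, words.shape)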