import pandas as pd
from scipy import stats
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import glob
import math
from collections import OrderedDict
from functools import partial
from decimal import Decimal, getcontext


data = './3classdata/W/1.csv'
# data = './progetto/3classdata/Y/2.csv'
window = 3        # word length: number of symbols per word
resolution = 500  # half the number of discretization points
shift = 3         # step between consecutive words in the rolling window


# fixed-precision rounding; note that this shadows the built-in round()
def round(n):
    pst = '0.00001'
    precision = Decimal(pst)
    getcontext().prec = len(pst)
    return Decimal(n).quantize(precision)

assert round(5.00000000000000001)


# 1. read file
def read_csv(document):
    df = pd.read_csv(document, dtype=float, header=None)
    # rotate dataframe (each column is a temporal sequence)
    df = df.T.reset_index().reindex(axis=1).drop(['index'], axis=1).infer_objects()
    return df


# 2. rescale each column to the range [-1, 1]
def normalize(df):
    df = ((df - df.min()) / (df.max() - df.min())) * 2 - 1
    return df


# 3. gaussian + discretize
def gaussian_interval(resolution):
    # use a gaussian distribution, mean 0, std deviation 0.25
    distribution = stats.norm(loc=0, scale=0.25)
    # bounds of range for inverse cumulative distribution function
    bounds = distribution.cdf([-1, 1])
    # generate linear space of 2*resolution points using bounds
    linsp = np.linspace(*bounds, num=2*resolution)
    # obtain the array of 2*resolution points (bin edges, denser around 0)
    return distribution.ppf(linsp)


def discretize(df, interval, how="left"):
    # replace each value with the left or right edge of the bin it falls into
    if how == "left":
        return df.apply(lambda x: pd.cut(x, interval, right=True, precision=5).apply(lambda l: l.left))
    elif how == "right":
        return df.apply(lambda x: pd.cut(x, interval, right=True, precision=5).apply(lambda l: l.right))
    else:
        assert False, "specify left / right"


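# A minimal sketch of how the preprocessing steps above compose; the helper
# name is illustrative and not part of the original pipeline.
def _example_preprocess(path=data, res=resolution):
    df = read_csv(path)                # one column per temporal sequence
    df = normalize(df)                 # rescale to [-1, 1]
    interval = gaussian_interval(res)  # gaussian bin edges
    return discretize(df, interval, how="left")

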
def parallel(n=4, target=None, iterable=None, **kwargs):
    assert target is not None
    assert iterable is not None

    fun = partial(target, **kwargs)

    from multiprocessing import Pool
    with Pool(n) as p:
        if type(iterable) is OrderedDict:
            # preserve the key -> result association
            values = p.map(fun, iterable.values())
            return OrderedDict((k, v) for k, v in zip(iterable.keys(), values))
        else:
            return p.map(fun, iterable)


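# A minimal sketch of 'parallel' applied to a list and to an OrderedDict of
# dataframes; the glob pattern is a guess based on the 'data' path above and
# the helper name is illustrative.
def _example_parallel(pattern='./3classdata/W/*.csv'):
    dfs = [read_csv(f) for f in sorted(glob.glob(pattern))]
    as_list = parallel(n=4, target=normalize, iterable=dfs)
    as_dict = parallel(n=4, target=normalize,
                       iterable=OrderedDict(zip(sorted(glob.glob(pattern)), dfs)))
    return as_list, as_dict

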
# rolling window generates words
def generate_words(col, w, s):
    word = []
    idx = 0
    distance = abs(w - s)
    while idx < len(col):
        if len(word) == w:
            # a full word of w symbols: emit it, then move the index
            # according to the shift s before starting the next word
            yield tuple(word)
            word = []
            if w < s and idx == w:
                idx += distance
            if w > s:
                idx -= distance
        else:
            word.append(round(col[idx]))
            idx += 1


# designed to be used on a df
# use 'parallel' for dfs
def rolling_window(df, w, s):
    ret = list()
    for _, col in df.items():
        ret.append(tuple(generate_words(col, w, s)))
    return pd.DataFrame(ret).T.reset_index().reindex(axis=1).drop(['index'], axis=1)


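# A minimal sketch of the rolling window: with window == shift the columns are
# split into consecutive, non-overlapping words of 'window' rounded symbols.
# The helper name is illustrative.
def _example_rolling_window(df):
    words_df = rolling_window(df, window, shift)
    # or, for an OrderedDict of dataframes:
    # words = parallel(target=rolling_window, iterable=odct, w=window, s=shift)
    return words_df

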
# count term frequency within a column
def text_freq(col):
    d = dict()
    for word, cnt in col.value_counts().items():
        hashable = word  # hashable to be used as index
        if hashable in d:
            d[hashable] += cnt
        else:
            d[hashable] = cnt
    return d


# designed to be used on a df
# use 'parallel' for dfs
def extract_words(df, how=None, nwords=3):
    colList = list()
    for _, col in df.items():
        freq = how(col)
        d = dict()
        for entry in col:
            d[entry] = freq[entry]
        # keep the nwords entries with the highest frequency
        colList.append(tuple(sorted(d, key=lambda x: d[x])[-nwords:]))
    return pd.DataFrame(colList).T


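# A minimal sketch of extracting the most frequent words per column, using
# 'text_freq' as the frequency measure; the helper name is illustrative.
def _example_extract_words(words_df):
    return extract_words(words_df, how=text_freq, nwords=3)

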
def term_in_df(df, term):
    # count in how many columns (sensors) the term appears
    cnt = 0
    for _, col in df.items():
        if term in col.values:
            cnt += 1
            print(col)
    assert cnt <= 20  # at most 20 sensors per dataframe
    return cnt


def funLog(word, length, sets):
    # idf of a single word: log10(#documents / #documents containing the word)
    return (word, math.log10(length / sum([word in s for s in sets])))


def idf(dfs):
    words = set()
    D = list()  # one set of words per column (document)
    for df in dfs:
        for _, col in df.items():
            D.append(set(col.values))
            words.update(col.values)

    idfs = parallel(target=funLog, iterable=words,
                    length=len(D), sets=D, n=12)
    return dict(idfs)


def _chunker(seq, size):
    assert type(seq) is list
    return (seq[pos: pos+size] for pos in range(0, len(seq), size))


def tfidf(odct, nwords=3):
    assert type(odct) is OrderedDict
    lst = list(odct.values())
    idfs = idf(lst)  # idf of every word across all dataframes, in parallel
    tfs = list()
    for df in lst:
        for _, col in df.items():
            tfs.append(text_freq(col))  # associate every word with its tf for each sensor

    tf_idfs = list()
    for tf in tfs:
        d = dict()
        for word, t in tf.items():
            d[word] = t * idfs[word]
        tf_idfs.append(d)

    dflist = list()  # extract the nwords best-scoring words per sensor
    for sensors in _chunker(tf_idfs, 20):  # 20 sensors (columns) per dataframe
        tidList = list()
        for s in sensors:
            bestThree = sorted(s, key=lambda x: s[x])[-nwords:]
            tidList.append(tuple(bestThree))
        dflist.append(pd.DataFrame(tidList).T)
    return OrderedDict((k, v) for k, v in zip(odct.keys(), dflist))
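

# A minimal end-to-end sketch, assuming each CSV in the class directory holds
# one recording with one sensor per column; the glob pattern and variable
# names are illustrative, not part of the original pipeline.
if __name__ == '__main__':
    interval = gaussian_interval(resolution)
    odct = OrderedDict()
    for path in sorted(glob.glob('./3classdata/W/*.csv')):
        df = discretize(normalize(read_csv(path)), interval, how="left")
        odct[path] = rolling_window(df, window, shift)
    best = tfidf(odct, nwords=3)  # nwords best tf-idf words per sensor
    for path, words in best.items():
        print(path)
        print(words)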