import glob
import math
from collections import OrderedDict
from decimal import *
from functools import partial

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

data = './3classdata/W/1.csv'
#data='./progetto/3classdata/Y/2.csv'
window = 3
resolution = 500
shift = 3


def round(n):
    """Quantize *n* to five decimal places and return it as a ``Decimal``.

    This intentionally shadows the builtin ``round`` for the rest of the
    module.

    NOTE: every call sets the precision of the current ``decimal`` context
    to ``len('0.00001')`` (7 significant digits). With that precision,
    ``quantize`` can only represent results of at most 7 digits, so this
    helper is meant for small magnitudes such as normalized values.
    """
    pst = '0.00001'
    precision = Decimal(pst)
    getcontext().prec = len(pst)
    return Decimal(n).quantize(precision)


assert round(5.00000000000000001)


# 1. read file
def read_csv(document):
    """Read a header-less CSV of floats and return it transposed.

    After the transpose each column of the returned frame is one row of the
    file (a temporal sequence, per the original author's note).
    """
    df = pd.read_csv(document, dtype=float, header=None)
    # rotate dataframe (each column is a temporal sequence)
    df = df.T.reset_index().reindex(axis=1).drop(['index'], axis=1).infer_objects()
    return df


# 2
def normalize(df):
    """Rescale every column linearly so its minimum maps to -1 and its maximum to 1.

    NOTE(review): a constant column makes ``max - min`` zero; the result for
    such a column is not handled specially here.
    """
    df = ((df - df.min()) / (df.max() - df.min())) * 2 - 1
    return df


# 3 gaussian + discretize
def gaussian_interval(resolution):
    """Return ``2 * resolution`` bin edges spaced according to a Gaussian.

    The edges are the inverse CDF of N(mean=0, std=0.25) evaluated on a
    linear grid between ``cdf(-1)`` and ``cdf(1)``.
    """
    # use a gaussian distribution, mean 0, std deviation 0.25
    distribution = stats.norm(loc=0, scale=0.25)
    # bounds of range for inverse cumulative distribution function
    bounds = distribution.cdf([-1, 1])
    # generate linear space of 2*resolution points using bounds
    linsp = np.linspace(*bounds, num=2 * resolution)
    # obtain the array of 2*resolution points
    return distribution.ppf(linsp)


def discretize(df, interval, how="left"):
    """Replace each value by one edge of the ``pd.cut`` bin it falls into.

    Args:
        df: frame whose columns are binned independently.
        interval: bin edges passed to ``pd.cut`` (right-closed bins).
        how: ``"left"`` to keep the bin's left edge, ``"right"`` for the
            right edge.

    Raises:
        AssertionError: if *how* is neither ``"left"`` nor ``"right"``.
    """
    if how not in ("left", "right"):
        # Raised explicitly so the check also fires under ``python -O``.
        raise AssertionError("specify left / right")

    def edge(bin_):
        # ``how`` names the Interval attribute to keep (left or right).
        return getattr(bin_, how)

    return df.apply(
        lambda x: pd.cut(x, interval, right=True, precision=5).apply(edge)
    )


def parallel(n=4, target=None, iterable=None, **kwargs):
    """Map *target* over *iterable* using a pool of *n* worker processes.

    Extra keyword arguments are bound to *target* with ``functools.partial``.

    Args:
        n: number of worker processes.
        target: callable applied to every item; must not be None.
        iterable: items to process; must not be None. When it is an
            ``OrderedDict`` the function is applied to its values and an
            ``OrderedDict`` with the same keys is returned.

    Returns:
        A list of results, or an ``OrderedDict`` (see *iterable*).
    """
    assert target is not None
    assert iterable is not None
    fun = partial(target, **kwargs)
    from multiprocessing import Pool
    # The context manager shuts the workers down once the (blocking) map
    # has returned, so no processes are left behind.
    with Pool(n) as p:
        if isinstance(iterable, OrderedDict):
            values = p.map(fun, iterable.values())
            return OrderedDict(zip(iterable.keys(), values))
        return p.map(fun, iterable)
# rolling window generates words
def generate_words(col, w, s):
    """Yield consecutive words (tuples of *w* rounded samples) from *col*.

    A new word starts every *s* positions, so words overlap when ``s < w``
    and samples are skipped when ``s > w``. Each sample goes through the
    module's ``round`` helper. Only complete words are produced; a trailing
    run shorter than *w* is dropped.

    Args:
        col: integer-indexable sequence of samples (``col[i]`` for
            ``i`` in ``0 .. len(col) - 1``).
        w: word (window) length, must be positive.
        s: shift between the starts of consecutive words, must be positive.

    Raises:
        ValueError: if *w* or *s* is not positive.
    """
    if w <= 0 or s <= 0:
        raise ValueError("window and shift must be positive")
    # range() stops at the last start that still leaves room for a full
    # window, so the final complete word is emitted as well.
    for start in range(0, len(col) - w + 1, s):
        yield tuple(round(col[idx]) for idx in range(start, start + w))


# designed to be used on a df
# use 'parallel' for dfs
def rolling_window(df, w, s):
    """Turn every column of *df* into a column of words.

    Each column is passed to ``generate_words(col, w, s)``; the returned
    frame has one column per input column and one row per word.
    """
    ret = [tuple(generate_words(col, w, s)) for _, col in df.items()]
    return pd.DataFrame(ret).T.reset_index().reindex(axis=1).drop(['index'], axis=1)


# count term frequency of columns
def text_freq(col):
    """Return a dict mapping each distinct value of *col* to its count.

    Counts come from ``Series.value_counts``.
    """
    return dict(col.value_counts().items())


# designed to be used on a df
# use 'parallel' for dfs
def extract_words(df, how=None, nwords=3):
    """Keep, for every column, the *nwords* entries with the highest score.

    Args:
        df: frame whose columns are processed independently.
        how: callable taking a column and returning a mapping
            ``entry -> score``; defaults to ``text_freq`` when None.
        nwords: number of top-scoring entries kept per column.

    Returns:
        A frame with one column per input column, entries sorted by
        ascending score.
    """
    if how is None:
        how = text_freq
    col_list = []
    for _, col in df.items():
        freq = how(col)
        scores = {entry: freq[entry] for entry in col}
        col_list.append(tuple(sorted(scores, key=scores.get)[-nwords:]))
    return pd.DataFrame(col_list).T


def term_in_df(df, term):
    """Count the columns of *df* that contain *term* among their values."""
    cnt = 0
    for _, col in df.items():
        # ``term in col`` would test the Series *index*; compare against the
        # stored values instead.
        if term in col.tolist():
            cnt += 1
    # Sanity check kept from the original code: at most 20 matching columns.
    assert cnt <= 20
    return cnt


def funLog(word, length, sets):
    """Return ``(word, log10(length / k))`` where *k* is the number of sets containing *word*.

    Raises:
        ZeroDivisionError: if no set contains *word*.
    """
    containing = sum(word in s for s in sets)
    return (word, math.log10(length / containing))


def idf(dfs):
    """Compute the inverse document frequency of every word in *dfs*.

    Every column of every frame counts as one document. The per-word
    computation is delegated to ``funLog`` through ``parallel`` with 12
    worker processes.

    Returns:
        dict mapping each word to its idf value.
    """
    words = set()
    documents = []
    for df in dfs:
        for _, col in df.items():
            documents.append(set(col.values))
            words.update(col.values)
    idfs = parallel(target=funLog, iterable=words, length=len(documents),
                    sets=documents, n=12)
    return dict(idfs)


def _chunker(seq, size):
    """Return a generator over consecutive slices of *seq* of length *size*.

    The last slice may be shorter. *seq* must be a ``list``.
    """
    assert type(seq) is list
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))


def tfidf(odct, nwords=3):
    """Select the *nwords* best tf-idf words for every column of every frame.

    Args:
        odct: ``OrderedDict`` mapping a key to a frame of words.
        nwords: number of top-scoring words kept per column.

    Returns:
        ``OrderedDict`` with the same keys; each value is a frame holding,
        per input column, the *nwords* words with the highest
        ``term frequency * idf`` in ascending score order.
    """
    assert type(odct) is OrderedDict
    # idf is computed once over the columns of all frames together
    idfs = idf(list(odct.values()))
    result = OrderedDict()
    for key, df in odct.items():
        # Group results by the frame's own columns, whatever their number.
        best_per_col = []
        for _, col in df.items():
            scores = {word: tf * idfs[word] for word, tf in text_freq(col).items()}
            best_per_col.append(tuple(sorted(scores, key=scores.get)[-nwords:]))
        result[key] = pd.DataFrame(best_per_col).T
    return result