utils.math.statistics
Mathematical statistics.
"""
Mathematical statistics.
"""

import numpy
import pandas

from typing import List, Hashable

from ..logs.log import logger


def _fillShortGaps(mask: numpy.ndarray, maxGap: int) -> numpy.ndarray:
    """
    Return a copy of the boolean `mask` in which every run of `False`
    values that is no longer than `maxGap` is set to `True`. Runs at the
    very start and at the very end of the mask are treated the same way.

    A mask without a single `True` value is returned unchanged, because
    there is no outlier for anything to be a neighbour of.
    """
    filled = mask.copy()
    if not mask.any():
        return filled

    # position where the current run of `False` values began,
    # or -1 when we are not inside such a run
    runStart = -1
    for pos, flagged in enumerate(mask):
        if not flagged:
            if runStart < 0:
                runStart = pos
            continue
        # an outlier closes the current run; fill it if it is short enough
        if runStart >= 0 and pos - runStart <= maxGap:
            filled[runStart:pos] = True
        runStart = -1

    # trailing run after the last outlier
    if runStart >= 0 and len(mask) - runStart <= maxGap:
        filled[runStart:] = True
    return filled


def findOutliers(
    srs: pandas.Series,
    sigma: int = 3,
    joinNeighbouringOutliers: bool = True,
    howManyCountAsNeighbours: int = 3
) -> pandas.Series:
    """
    Find outliers in the series and return a boolean mask (a series
    named `outliers` with the same index as `srs`) in which outliers
    are marked with `True`.

    A finite value is an outlier when it lies at least `sigma` scaled
    median absolute deviations (`1.48 * MAD`) away from the median of all
    finite values. Values equal to the median are never outliers, so
    a constant series (where MAD is zero) has no outliers. NaNs and INFs,
    if any, are ignored when computing the median/MAD and do not count
    as outliers on their own.

    If `joinNeighbouringOutliers` is `True`, then runs of non-outlier
    values between neighbouring outliers will be marked as outliers too,
    provided the run is not longer than `howManyCountAsNeighbours`.
    The same goes for starting/ending elements, if they are close enough
    to an outlier. Joining works by position, so NaNs/INFs that sit inside
    such a run are marked as well. If there are no outliers at all,
    nothing is joined.

    Example with `joinNeighbouringOutliers` set to `True`:

    ``` py
    import pandas
    import numpy

    from phab.utils.math import statistics

    srs = pandas.Series(
        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
    )

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=True,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - outlier
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    # 3.0 - outlier
    # 101.0 - outlier
    # 2.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    ```

    Example with `joinNeighbouringOutliers` set to `False`:

    ``` py
    # ...

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=False,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - regular
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - regular
    # 3.0 - regular
    # 3.0 - regular
    # 101.0 - outlier
    # 2.0 - regular
    # 4.0 - regular
    # 3.0 - regular
    ```
    """
    # work on plain positional arrays, so that duplicated or non-integer
    # index labels cannot make one write touch several rows
    values = srs.to_numpy(dtype=float)
    finite = numpy.isfinite(values)

    # nothing is an outlier until proven otherwise; NaNs/INFs stay `False`
    flagged = numpy.zeros(len(values), dtype=bool)

    finiteValues = values[finite]
    # guard against taking a median of nothing (all-NaN or empty input)
    if finiteValues.size > 0:
        med = numpy.median(finiteValues)
        # 1.48 is presumably the rounded 1.4826 factor that turns MAD into
        # an estimate of the standard deviation for normal data
        sig = 1.48 * numpy.median(numpy.abs(finiteValues - med))
        spread = sigma * sig
        beyondBounds = (
            (finiteValues <= med - spread) | (finiteValues >= med + spread)
        )
        # with zero spread both bounds collapse onto the median itself,
        # and the median must not be reported as an outlier
        flagged[finite] = beyondBounds & (finiteValues != med)

    if joinNeighbouringOutliers:
        # later we might want to have the number of neighbours to scale
        # with the series length, but for now it is passed as a fixed
        # number in the `howManyCountAsNeighbours` argument
        flagged = _fillShortGaps(flagged, howManyCountAsNeighbours)

    return pandas.Series(flagged, index=srs.index, name="outliers")
def
findOutliers( srs: pandas.core.series.Series, sigma: int = 3, joinNeighbouringOutliers: bool = True, howManyCountAsNeighbours: int = 3) -> pandas.core.series.Series:
def _fillShortGaps(mask: numpy.ndarray, maxGap: int) -> numpy.ndarray:
    """
    Return a copy of the boolean `mask` in which every run of `False`
    values that is no longer than `maxGap` is set to `True`. Runs at the
    very start and at the very end of the mask are treated the same way.

    A mask without a single `True` value is returned unchanged, because
    there is no outlier for anything to be a neighbour of.
    """
    filled = mask.copy()
    if not mask.any():
        return filled

    # position where the current run of `False` values began,
    # or -1 when we are not inside such a run
    runStart = -1
    for pos, flagged in enumerate(mask):
        if not flagged:
            if runStart < 0:
                runStart = pos
            continue
        # an outlier closes the current run; fill it if it is short enough
        if runStart >= 0 and pos - runStart <= maxGap:
            filled[runStart:pos] = True
        runStart = -1

    # trailing run after the last outlier
    if runStart >= 0 and len(mask) - runStart <= maxGap:
        filled[runStart:] = True
    return filled


def findOutliers(
    srs: pandas.Series,
    sigma: int = 3,
    joinNeighbouringOutliers: bool = True,
    howManyCountAsNeighbours: int = 3
) -> pandas.Series:
    """
    Find outliers in the series and return a boolean mask (a series
    named `outliers` with the same index as `srs`) in which outliers
    are marked with `True`.

    A finite value is an outlier when it lies at least `sigma` scaled
    median absolute deviations (`1.48 * MAD`) away from the median of all
    finite values. Values equal to the median are never outliers, so
    a constant series (where MAD is zero) has no outliers. NaNs and INFs,
    if any, are ignored when computing the median/MAD and do not count
    as outliers on their own.

    If `joinNeighbouringOutliers` is `True`, then runs of non-outlier
    values between neighbouring outliers will be marked as outliers too,
    provided the run is not longer than `howManyCountAsNeighbours`.
    The same goes for starting/ending elements, if they are close enough
    to an outlier. Joining works by position, so NaNs/INFs that sit inside
    such a run are marked as well. If there are no outliers at all,
    nothing is joined.

    Example with `joinNeighbouringOutliers` set to `True`:

    ``` py
    import pandas
    import numpy

    from phab.utils.math import statistics

    srs = pandas.Series(
        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
    )

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=True,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - outlier
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    # 3.0 - outlier
    # 101.0 - outlier
    # 2.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    ```

    Example with `joinNeighbouringOutliers` set to `False`:

    ``` py
    # ...

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=False,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - regular
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - regular
    # 3.0 - regular
    # 3.0 - regular
    # 101.0 - outlier
    # 2.0 - regular
    # 4.0 - regular
    # 3.0 - regular
    ```
    """
    # work on plain positional arrays, so that duplicated or non-integer
    # index labels cannot make one write touch several rows
    values = srs.to_numpy(dtype=float)
    finite = numpy.isfinite(values)

    # nothing is an outlier until proven otherwise; NaNs/INFs stay `False`
    flagged = numpy.zeros(len(values), dtype=bool)

    finiteValues = values[finite]
    # guard against taking a median of nothing (all-NaN or empty input)
    if finiteValues.size > 0:
        med = numpy.median(finiteValues)
        # 1.48 is presumably the rounded 1.4826 factor that turns MAD into
        # an estimate of the standard deviation for normal data
        sig = 1.48 * numpy.median(numpy.abs(finiteValues - med))
        spread = sigma * sig
        beyondBounds = (
            (finiteValues <= med - spread) | (finiteValues >= med + spread)
        )
        # with zero spread both bounds collapse onto the median itself,
        # and the median must not be reported as an outlier
        flagged[finite] = beyondBounds & (finiteValues != med)

    if joinNeighbouringOutliers:
        # later we might want to have the number of neighbours to scale
        # with the series length, but for now it is passed as a fixed
        # number in the `howManyCountAsNeighbours` argument
        flagged = _fillShortGaps(flagged, howManyCountAsNeighbours)

    return pandas.Series(flagged, index=srs.index, name="outliers")
Find outliers in the series and return a boolean mask in which outliers
are marked with True. NaNs and INFs, if any, do not count
as outliers.
If joinNeighbouringOutliers is True, then non-outlier values
between neighbouring outliers will be marked as outliers too. The same
goes for starting/ending elements, if they are close enough to an outlier.
Example with joinNeighbouringOutliers set to True:
import pandas
import numpy
from phab.utils.math import statistics
srs = pandas.Series(
[4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
)
markedOutliers = statistics.findOutliers(
srs,
sigma=3,
joinNeighbouringOutliers=True,
howManyCountAsNeighbours=3
)
for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - outlier
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - outlier
# 3.0 - outlier
# 3.0 - outlier
# 101.0 - outlier
# 2.0 - outlier
# 4.0 - outlier
# 3.0 - outlier
Example with joinNeighbouringOutliers set to False:
# ...
markedOutliers = statistics.findOutliers(
srs,
sigma=3,
joinNeighbouringOutliers=False,
howManyCountAsNeighbours=3
)
for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - regular
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - regular
# 3.0 - regular
# 3.0 - regular
# 101.0 - outlier
# 2.0 - regular
# 4.0 - regular
# 3.0 - regular