количестве строк в оригинальных данных он:
1. тормозит, так как, судя по всему, один массив всегда обрабатывается в одном потоке
2. падает по OOM (потребляет огромное количество памяти, когда делает биннинг — пометил это место комментарием), так как по каким-то причинам не сбрасывает данные на диск (возможно, потому что group by посередине и всё складывается в одну строку).
Ваши мысли, господа?
-- Per (key1, key2) pair of a "violator", report the average of both metrics and
-- the number of measurements kept for that pair.
--
-- Every measurement is a tuple (key1, key2, metric1, metric2, violation_flag),
-- so inside the arrays below: .1/.2 are the keys, .3/.4 the metrics, .5 the flag.
select
    arrMeasuredViolatorsAterFiltersPerPKTuple.1.1 as key1,
    arrMeasuredViolatorsAterFiltersPerPKTuple.1.2 as key2,
    -- average of metric1, skipping entries equal to the -1 sentinel
    arrayReduce('avgIf', arrMeasuredViolatorsAterFiltersPerPKTuple.2.3, arrayMap(x -> x != -1, arrMeasuredViolatorsAterFiltersPerPKTuple.2.3)) avgM1,
    -- average of metric2, skipping entries equal to the -1 sentinel
    -- (the averaged array and the mask must both be metric2; averaging .2.3 here
    -- returned metric1 filtered by metric2's mask)
    arrayReduce('avgIf', arrMeasuredViolatorsAterFiltersPerPKTuple.2.4, arrayMap(x -> x != -1, arrMeasuredViolatorsAterFiltersPerPKTuple.2.4)) avgM2,
    length(arrMeasuredViolatorsAterFiltersPerPKTuple.2) NumOfViolations,
    toString(arrMeasuredViolatorsAterFiltersPerPKTuple) debugStr
from
(
    select
        -- all non-empty per-row measurement arrays collapsed into one flat array;
        -- there is no GROUP BY, so the whole input ends up in a single row
        arrayFlatten(groupArrayIf(measArr, length(measArr) > 0)) as measArrFlattened,
        -- length(measArrFlattened),
        -- distinct key2 values having at least one measurement with the flag set
        arrayReduce('groupUniqArrayIf', measArrFlattened.2, measArrFlattened.5) as arrDistinctViolators,
        -- length(arrDistinctViolators),
        -- every measurement (flagged or not) that belongs to one of those key2 values
        arrayFilter(x -> has(arrDistinctViolators, x.2), measArrFlattened) as arrViolatorsMeas,
        toString(arrViolatorsMeas),
        -- per key2: number of flagged measurements (sum of the 0/1 flag)
        arrayReduce('sumMap', [arrViolatorsMeas.2], [arrViolatorsMeas.5]) as violatorsWithViolationCnts,
        -- per key2: total number of measurements (sum of ones)
        arrayReduce('sumMap', [arrViolatorsMeas.2], [replicate(1, arrViolatorsMeas)]) as violatorsWithTotalMeasCounts,
        -- (key2, flagged count, total count); both sumMap calls use the same key array
        arrayZip(violatorsWithViolationCnts.1, violatorsWithViolationCnts.2, violatorsWithTotalMeasCounts.2) violatorsWithViolAndTotalMeasCounts,
        toString(violatorsWithViolAndTotalMeasCounts),
        -- only consider 50% violation measurements as violator
        arrayFilter(x -> (x.2 * 100 / x.3) >= 50, violatorsWithViolAndTotalMeasCounts) violatorsAfterFilters,
        -- toString(violatorsAfterFilters),
        -- measurements of the key2 values that passed the 50% threshold
        arrayFilter(x -> has(violatorsAfterFilters.1, x.2), arrViolatorsMeas) as arrViolatorsMeasAfterFilters,
        toString(arrViolatorsMeasAfterFilters),
        -- distinct (key1, key2) pairs among the remaining measurements
        arrayDistinct(arrayMap((x) -> (x.1, x.2), arrViolatorsMeasAfterFilters)) as arrPKTuple,
        -- this one makes query eat a lot of memory and be very slow
        -- NOTE(review): the inner arrayFilter walks the whole measurement array once
        -- per distinct (key1, key2) pair, so the work is pairs x measurements.
        arrayMap(
            (x) -> (x, arrayFilter((y) -> x.1 = y.1 and x.2 = y.2, arrViolatorsMeasAfterFilters)),
            arrPKTuple
        ) as arrMeasuredViolatorsAterFiltersPerPKTuple,
        toString(arrMeasuredViolatorsAterFiltersPerPKTuple)
    from (
        -- Synthetic input: each generated row carries three measurements that share
        -- key00 and have independent second keys, metrics and violation flags.
        WITH 100 as keycnt, 0.1 as violatorFraction, 10 as metricRange, 50 as violationProbPct,
            (cityHash64(number, 10) % keycnt) as key00,
            (cityHash64(number, 11) % keycnt) as key12,
            (cityHash64(number, 12) % metricRange) as metric11,
            (cityHash64(number, 13) % metricRange) as metric12,
            -- flag can only be 1 for second keys below keycnt * violatorFraction
            if(cityHash64(number, 14) % 100 <= violationProbPct and key12 < (keycnt * violatorFraction), 1, 0) as metric13,
            (cityHash64(number, 21) % keycnt) as key22,
            (cityHash64(number, 22) % metricRange) as metric21,
            (cityHash64(number, 23) % metricRange) as metric22,
            if(cityHash64(number, 24) % 100 <= violationProbPct and key22 < (keycnt * violatorFraction), 1, 0) as metric23,
            (cityHash64(number, 31) % keycnt) as key32,
            (cityHash64(number, 32) % metricRange) as metric31,
            (cityHash64(number, 33) % metricRange) as metric32,
            if(cityHash64(number, 34) % 100 <= violationProbPct and key32 < (keycnt * violatorFraction), 1, 0) as metric33
        select
            key12, key22, key32,
            cityHash64(number, 14) % 100,
            cityHash64(number, 24) % 100,
            cityHash64(number, 34) % 100,
            metric13,
            metric23,
            metric33,
            tuple(key00, key12, metric11, metric12, metric13) as meas1,
            tuple(key00, key22, metric21, metric22, metric23) as meas2,
            tuple(key00, key32, metric31, metric32, metric33) as meas3,
            -- keep only measurements whose first metric is greater than 2
            arrayFilter(
                x -> x.3 > 2
                , [meas1, meas2, meas3]
            ) as measArr,
            toString(measArr), length(measArr)
        from numbers(1000)
    )
) ARRAY JOIN arrMeasuredViolatorsAterFiltersPerPKTuple;
— from numbers(1000): на миллионе строк уже 3.28 GiB памяти (почти не думает), на 2 миллионах — 6.57 GiB, и думает лишние 3 секунды
Обсуждают сегодня