2015 Теормин (1185421), страница 5
Текст из файла (страница 5)
>>> log2.registerTempTable('log2')
>>> sqlContext.sql('select count(*) from log2')
DataFrame[_c0: bigint]
>>> sqlContext.sql('select count(*) from log2').collect()
[Row(_c0=1000000)]
>>> sqlContext.sql('select * from log2 where domain like \'%lenta.ru\' order by dt desc limit 3').collect()
[Row(domain='lenta.ru', dt=20141129, month=201411, times='84755', url='http://lenta.ru/news/2013/11/29/love/', user_id='1235907'), Row(domain='lenta.ru', dt=20141129, month=201411, times='74491', url='http://lenta.ru/news/2013/11/29/love/', user_id='1724463'), Row(domain='lenta.ru', dt=20141129, month=201411, times='73832', url='http://lenta.ru/news/2013/11/29/block/', user_id='1724463')]
>>> sqlContext.sql('select user_id, domain, month, count(domain) over (partition by month, domain), count(domain) over (partition by month, domain, user_id) from log2').show(3)
+-------+--------------------+------+---+---+
|user_id|              domain| month|_c3|_c4|
+-------+--------------------+------+---+---+
| 385625|        10.0.110.245|201410|  1|  1|
| 447272|1119997.1001golos.ru|201410|  1|  1|
| 447272|1125714.1001golos.ru|201410|  1|  1|
+-------+--------------------+------+---+---+
>>>