/* ************************ * 5.2. Távolságeloszlás * **************************/ /* Aki emlitett minket legalabb n-szer: barat. Baratok eloszlasa kulonbozo n-eknel. */ SELECT Friends, COUNT(*) as Occurance FROM ( SELECT TOP 10000 [mentioned_user_id], COUNT(*) as Friends FROM [Twitter].[dbo].[user_mention] WHERE count > (n-1) AND run_id = 1001 GROUP BY mentioned_user_id ) s GROUP BY Friends ORDER BY Friends /* Egyirányú említések leszámolása */ SELECT run_id, user_id, mentioned_user_id, COUNT(*) as count /* INTO [rudolf].[dbo].[mention_counts] */ FROM [Twitter].[dbo].[tweet_user_mention] GROUP BY run_id, user_id, mentioned_user_id /* Egymást kölcsönösen említő userek listázása, számolva aszerint, hogy hányszor említették egymást. */ CREATE VIEW [dbo].[user_mutual_mention_count] AS SELECT *, COUNT(*) as mention_count FROM (SELECT a.run_id, a.user_id AS user_a_id, b.user_id AS user_b_id FROM [Twitter].[dbo].[tweet_user_mention] a INNER JOIN [Twitter].[dbo].[tweet_user_mention] b ON a.run_id = b.run_id AND a.user_id = b.mentioned_user_id AND a.mentioned_user_id = b.user_id WHERE a.user_id < b.user_id) s GROUP BY run_id, user_a_id, user_b_id /* Az előzőhöz koordináték joinolása */ CREATE VIEW [dbo].[mutual_mention_locations] AS WITH locations (run_id, USER_ID, lat, lon, cx, cy, cz) AS ( SELECT run_id, USER_ID, lat, lon, cx, cy, cz FROM [jszule].[dbo].[user_location] WHERE cluster_id = 0 ) SELECT TOP 100 [rudolf].[dbo].[user_mutual_mention_count].user_a_id, [rudolf].[dbo].[user_mutual_mention_count].user_b_id, [rudolf].[dbo].[user_mutual_mention_count].mention_count, locations_a.cx AS cx_a, locations_a.cy AS cy_a, locations_a.cz AS cz_a, locations_b.cx AS cx_b, locations_b.cy AS cy_b, locations_b.cz AS cz_b FROM ([rudolf].[dbo].[user_mutual_mention_count] JOIN locations AS locations_a ON [rudolf].[dbo].[user_mutual_mention_count].user_a_id = locations_a.USER_ID) JOIN locations AS locations_b ON [rudolf].[dbo].[user_mutual_mention_count].user_b_id = locations_b.USER_ID /* Egyiranyu emlitesek leszamolasa */ SELECT TOP 1000 [run_id] ,[user_id] ,[mentioned_user_id], COUNT(*) as count FROM [Twitter].[dbo].[tweet_user_mention] GROUP BY run_id, user_id, mentioned_user_id /* Összes user listázása egygráfból (itt mention) EZ GYORSABB */ SELECT DISTINCT * FROM ( SELECT user_a_id FROM [rudolf].[dbo].[mutual_mention_locatins_counted] UNION ALL SELECT user_b_id FROM [rudolf].[dbo].[mutual_mention_locatins_counted] ) s /* VAGY */ SELECT user_a_id FROM [rudolf].[dbo].[mutual_mention_locatins_counted] UNION SELECT user_b_id FROM [rudolf].[dbo].[mutual_mention_locatins_counted] /* Twitter_COLD-ból az egyirányú említések számolása + kölcsönös user gráfba ezek berakása + JOIN a koordinátákkal */ /* Először */ SELECT [run_id] ,[user_id] ,[mentioned_user_id] ,COUNT(*) as cnt INTO [rudolf].[dbo].[mention_counts_COLD] FROM [Twitter_COLD].[dbo].[tweet_user_mention] GROUP BY run_id, user_id, mentioned_user_id /* Aztán: */ SELECT graph.run_id as run_id, graph.user_a_id as user_a_id, graph.user_b_id as user_b_id, count1.cnt as atob, count2.cnt as btoa INTO [rudolf].[dbo].[mention_mutual_count_COLD] FROM [Twitter_COLD].[dbo].[user_mention_mutual] as graph JOIN [rudolf].[dbo].[mention_counts_COLD] as count1 ON graph.run_id = count1.run_id AND graph.user_a_id = count1.user_id AND graph.user_b_id = count1.mentioned_user_id JOIN [rudolf].[dbo].[mention_counts_COLD] as count2 ON graph.run_id = count1.run_id AND graph.user_b_id = count1.user_id AND graph.user_a_id = count1.mentioned_user_id /* Végül: */ WITH locations (run_id, USER_ID, lat, lon, cx, cy, cz) AS ( SELECT run_id, USER_ID, lat, lon, cx, cy, cz FROM [jszule].[dbo].[user_location] WHERE cluster_id = 0 ) SELECT graph.run_id as run_id graph.user_a_id as user_a_is, graph.user_b_id as user_b_id, graph.atob as atob, graph.btoa as btoa, locations_a.cx AS cx_a, locations_a.cy AS cy_a, locations_a.cz AS cz_a, locations_a.lon AS lon_a, locations_a.lat AS lat_a, locations_b.cx AS cx_b, locations_b.cy AS cy_b, locations_b.cz AS cz_b, locations_b.lon AS lon_b, locations_b.lat AS lat_b INTO [rudolf].[dbo].[mention_mutual_all_COLD] FROM ([rudolf].[dbo].[mention_mutual_count_COLD] as graph JOIN locations AS locations_a ON graph.run_id = locations_a.run_id AND graph.user_a_id = locations_a.USER_ID) JOIN locations AS locations_b ON graph.run_id = locations_b.run_id AND graph.user_b_id = locations_b.USER_ID /* *************************************************** * 5.2.2. Távolságeloszlás alakjának s és r függése. * *****************************************************/ /* atob és btoa számok hisztogramja */ SELECT w, COUNT(*) as cnt FROM ( (SELECT atob as w FROM [rudolf].[dbo].[mention_mutual_all] WHERE (cx_a <> 1) AND (cx_b <> 1)) UNION ALL (SELECT btoa as w FROM [rudolf].[dbo].[mention_mutual_all] WHERE (cx_a <> 1) AND (cx_b <> 1)) ) s GROUP BY w ORDER BY w /* ******************************************************************************* * 5.4.1. Egyszerű kvantitatív összehasonlítási mértékek, kis kvalitatív elemzés * ********************************************************************************* /* Mindkét gráfban résztvevő felhasználók listázása (1 058 457 user)*/ SELECT DISTINCT a.user_id /*INTO [rudolf].[dbo].[users_intersection]*/ FROM ( SELECT user_a_is as user_id FROM [rudolf].[dbo].[mention_mutual_all] UNION ALL SELECT user_b_id as user_id FROM [rudolf].[dbo].[mention_mutual_all] ) a JOIN ( SELECT user_a_id as user_id FROM [rudolf].[dbo].[follow_mutual_graph] UNION ALL SELECT user_b_id as user_id FROM [rudolf].[dbo].[follow_mutual_graph] ) b ON a.user_id = b.user_id /* ************************************** * 5.4.2. Metszet részletesebb elemzése * ****************************************/ /* A follower gráfból azok kiválasztása, akik a mentionban is benne vannak, az él irányítássára való tekintet nélkül rendezve */ SELECT follower.f_user_id as user_a_id, follower.followed_user_id as user_b_id INTO [rudolf].[dbo].[follower_mention_intersection] FROM [jszule].[dbo].[follower_2dir_coord_graph_only] as follower JOIN [rudolf].[dbo].[mutual_mention_locations_table] as mention ON (follower.f_user_id = mention.user_a_id AND follower.followed_user_id = mention.user_b_id) OR (follower.f_user_id = mention.user_b_id AND follower.followed_user_id = mention.user_a_id) /* Follower gráfból kiszedni a kölcsönös followereket */ SELECT DISTINCT a.f_user_id AS user_a_id, b.f_user_id AS user_b_id INTO [rudolf].[dbo].[follow_mutual_graph] FROM [jszule].[dbo].[follower_2dir_coord_graph_only] AS acount INNER JOIN [jszule].[dbo].[follower_2dir_coord_graph_only] AS b ON a.f_user_id = b.followed_user_id AND a.followed_user_id = b.f_user_id WHERE a.f_user_id < b.f_user_id /* Az előzőhöz fokszám joinolása mindkét gráf szerint */ /* Először a fokszámok: (3 278 469 user)*/ SELECT user_id, COUNT(*) deg /*INTO [rudolf].[dbo].[follow_user_degree]*/ FROM ( SELECT user_a_id as user_id FROM [rudolf].[dbo].[follow_mutual_graph] UNION ALL SELECT user_b_id as user_id FROM [rudolf].[dbo].[follow_mutual_graph] ) s GROUP BY user_id /* ÉS (1 554 203 user)*/ SELECT run_id, user_id, COUNT(*) deg /*INTO [rudolf].[dbo].[mention_user_degree]*/ FROM ( SELECT user_a_is as user_id FROM [rudolf].[dbo].[mention_mutual_all] UNION ALL SELECT user_b_id as user_id FROM [rudolf].[dbo].[mention_mutual_all] ) s GROUP BY run_id, user_id /* Végül maga a JOIN */ SELECT inter.user_id, follow.deg follow_deg, mention.deg mention_deg /*INTO [rudolf].[dbo].[users_intersection_degree]*/ FROM [rudolf].[dbo].[users_intersection] inter JOIN [rudolf].[dbo].[follow_user_degree] follow ON inter.user_id = follow.user_id JOIN [rudolf].[dbo].[mention_user_degree] mention ON inter.user_id = mention.user_id /* Tweetszám megszámolása userekhez (kb. 17 perc, 140 156 456 user)*/ SELECT [run_id] ,[user_id] ,COUNT(*) tweet_count INTO [rudolf].[dbo].[user_tweet_count] FROM [Twitter].[dbo].[tweet] GROUP BY run_id, user_id /* Fokszámok átlagolása */ SELECT [follow_deg] , AVG([mention_deg]) avg_mention_deg FROM [rudolf].[dbo].[users_intersection_degree] GROUP BY follow_deg ORDER BY follow_deg /* Fokszámok átlagolása */ SELECT [mention_deg] , AVG([follow_deg]) avg_follow_deg FROM [rudolf].[dbo].[users_intersection_degree] GROUP BY [mention_deg] ORDER BY [mention_deg] /* A metszetbeli fokszámok újragenerálása a tweet countokkal együtt */ SELECT inter.user_id, follow.deg follow_deg, mention.deg mention_deg, tweet.tweet_count tweet_count /*INTO [rudolf].[dbo].[users_intersection_degree]*/ FROM [rudolf].[dbo].[users_intersection] inter JOIN [rudolf].[dbo].[follow_user_degree] follow ON inter.user_id = follow.user_id JOIN [rudolf].[dbo].[mention_user_degree] mention ON inter.user_id = mention.user_id JOIN [rudolf].[dbo].[user_tweet_count] tweet ON inter.user_id = tweet.user_id WHERE tweet.run_id = 1004 /* Tweetek szama a follow baratok szamanak fuggvenyeben felhasznalonkent */ SELECT follow_deg, tweet_count FROM [rudolf].[dbo].[users_intersection_degree] ORDER BY follow_deg, tweet_count /* Tweetek szama a mention baratok szamanak fuggvenyeben felhasznalonkent */ SELECT mention_deg, tweet_count FROM [rudolf].[dbo].[users_intersection_degree] ORDER BY mention_deg, tweet_count /* Tweetek atlagos szama a follow baratok szamanak fuggvenyeben */ SELECT follow_deg, AVG(tweet_count) as avg_tweet_count FROM [rudolf].[dbo].[users_intersection_degree] GROUP BY follow_deg ORDER BY follow_deg /* Tweetek atlagos szama a mention baratok szamanak fuggvenyeben */ SELECT mention_deg, AVG(tweet_count) as avg_tweet_count FROM [rudolf].[dbo].[users_intersection_degree] GROUP BY mention_deg ORDER BY mention_deg /* Hisztogram a tweetekről (az 1 helyere mast is irhatunk)*/ SELECT tweet_count, COUNT(*) as cnt FROM (SELECT tweet_count FROM [rudolf].[dbo].[users_intersection_degree] WHERE follow_deg = 1) a GROUP BY tweet_count ORDER BY tweet_count /* Említési gráf leszűkítése a metszetre 1 018 332*/ SELECT a.* INTO [rudolf].[dbo].[mention_mutual_intersection_all] FROM [rudolf].[dbo].[mention_mutual_all] AS a JOIN [rudolf].[dbo].[users_intersection] as b ON a.user_a_is = b.user_id JOIN [rudolf].[dbo].[users_intersection] as c ON a.user_b_id = c.user_id /* Követési gráf leszűkítése a metszetre 8 209 525*/ SELECT a.* /* INTO [rudolf].[dbo].[follow_mutual_graph_intersection] */ FROM [rudolf].[dbo].[follow_mutual_graph] AS a JOIN [rudolf].[dbo].[users_intersection] as b ON a.user_a_id = b.user_id JOIN [rudolf].[dbo].[users_intersection] as c ON a.user_b_id = c.user_id /* *************************************************************** * 6.2. Hashtagek népszerűségének és lokalizáltságának izsgálata * *****************************************************************/ /* Egyutt emlitett hashtagek grafja */ SELECT a.run_id as run_id, a.tag AS tag_a, b.tag AS tag_b, a.tweet_id AS tweet_id, a.user_id AS user_id, a.created_at AS created_at FROM [Twitter_COLD].[dbo].[tweet_hashtag] AS a JOIN [Twitter_COLD].[dbo].[tweet_hashtag] AS b ON a.run_id = b.run_id AND a.tweet_id = b.tweet_id /* Koordináták hozzárendelése */ WITH tweet (run_id, tweet_id, created_at, utc_offset, USER_ID, lat, lon, cx, cy, cz) AS ( SELECT run_id, tweet_id, created_at, utc_offset, USER_ID, lat, lon, cx, cy, cz FROM [twitter_COLD].[dbo].[tweet] WHERE lon IS NOT NULL AND lat IS NOT NULL ) SELECT TOP 1000 tweet.[run_id] ,tweet.[tweet_id] ,hash.tag_a ,hash.tag_b ,tweet.[created_at] ,tweet.[utc_offset] ,tweet.[user_id] ,tweet.[lon] ,tweet.[lat] ,tweet.[cx] ,tweet.[cy] ,tweet.[cz] INTO [rudolf].[dbo].[hashtag_grap_coord] FROM tweet JOIN [rudolf].[dbo].[hashtag_graph] AS hash ON hash.run_id = tweet.run_id AND hash.tweet_id = tweet.tweet_id /* Hashtag párok lőfordulási gyakorisága */ SELECT run_id, tag_a, tag_b, COUNT(*) FROM [rudolf].[dbo].[hashtag_graph_coord] GROUP BY run_id, tag_a, tag_b /* Egyes hashtagek népszerűsége */ SELECT run_id, tag, COUNT(*) FROM [Twitter_COLD].[dbo].[tweet_hashtag] GROUP BY run_id, tag ORDER BY tag DESC /* A következő query először a 'tags' nevű részqueryben * * leszámolja a hashtagek előrofrulási gyakoriságát, majd * * az egyes hashtagek minden előrfordulásához koordinátát * * rendel, ha van, és mellé írja az előfordulási * * valószínűséget: */ WITH tags (run_id, tag, CNT) AS ( SELECT run_id, tag, COUNT(*) as CNT FROM [Twitter_1].[dbo].[tweet_hashtag] GROUP BY run_id, tag ) SELECT a.run_id, a.tag, a.created_at, a.tweet_id, b.cx, b.cy, b.cz, b.lat, b.lon, tags.CNT AS CNT INTO [rudolf].[dbo].[hashtag_coords_counts] FROM [Twitter_1].[dbo].[tweet_hashtag] AS a JOIN [Twitter_1].[dbo].[tweet] AS b ON a.run_id = b.run_id AND a.tweet_id = b.tweet_id JOIN tags ON a.run_id = tags.run_id AND tags.tag = a.tag WHERE lat IS NOT NULL AND lon IS NOT NULL /* Ezekhez leszámoltam az egyes hashtagek koordinátás * * tweetekben történő előfordulásainak számát (CNT_gt) is, * * ami az előző tábla tag-re való csoportosítással való * * számolást jelenti, majd hozzácsatoltam az eredetihez: */ WITH dummy (run_id, tag, CNT_gt) AS ( SELECT run_id, tag, COUNT(*) as CNT_gt FROM [rudolf].[dbo].[hashtag_coords_counts] GROUP BY run_id, tag ) SELECT main.run_id, main.tag, main.created_at, main.tweet_id, main.cx, main.cy, main.cz, main.lat, main.lon, main.CNT, dummy.CNT_gt INTO [rudolf].[dbo].[hashtag_coords_counts_2] FROM [rudolf].[dbo].[hashtag_coords_counts] AS main JOIN dummy ON dummy.tag = main.tag AND dummy.run_id = main.run_id /* Hashtagek gyakorisági hisztogramja */ SELECT CNT_gt, COUNT(*) as rank FROM ( SELECT DISTINCT tag, CNT_gt FROM [rudolf].[dbo].[hashtag_coords_counts_2] ) s GROUP BY CNT_gt ORDER BY CNT_gt /* Hashtagek átlagpozíciói */ SELECT tag, (sum(cx) / SQRT(sum(cx)*sum(cx) + sum(cy)*sum(cy) + sum(cz)*sum(cz))) as cx_avg, (sum(cy) / SQRT(sum(cx)*sum(cx) + sum(cy)*sum(cy) + sum(cz)*sum(cz))) as cy_avg, (sum(cz) / SQRT(sum(cx)*sum(cx) + sum(cy)*sum(cy) + sum(cz)*sum(cz))) as cz_avg, COUNT(*) as CNT_gt FROM rudolf.dbo.hashtag_coords_counts_2 GROUP BY tag /* Szórások kiszámolása az előző queryben létrehozott táblára támaszkodva */ SELECT a.tag, b.CNT_gt, SQRT(AVG( ( dkondor.dbo.fDistanceXyz(a.cx,a.cy,a.cz,b.cx_avg,b.cy_avg,b.cz_avg) * dkondor.dbo.fDistanceXyz(a.cx,a.cy,a.cz,b.cx_avg,b.cy_avg,b.cz_avg) ))) as sigma INTO rudolf.dbo.hashtag_sigmas FROM rudolf.dbo.hashtag_coords_counts_2 as a JOIN rudolf.dbo.hashtag_averages as b ON a.tag = b.tag GROUP BY a.tag, b.CNT_gt /* Együtt említett hashtagek gráfjának fokszámeloszlása */ SELECT rank, COUNT(*) as CNT FROM ((SELECT [tag_a] as tag, SUM(cnt) as rank FROM [rudolf].[dbo].[hashtag_graph_weights] GROUP BY tag_a) UNION (SELECT [tag_b] as tag, SUM(cnt) as rank FROM [rudolf].[dbo].[hashtag_graph_weights] GROUP BY tag_b)) a GROUP BY rank ORDER BY CNT DESC /* ******************************** * Egyéb (elavult / nem használt) * **********************************/ /* Egyes tweetek megszamolasa */ SELECT [tweet_id], COUNT(*) as tweetNum FROM [Twitter].[dbo].[tweet] GROUP BY tweet_id ORDER BY tweetNumc /* Retweetek eloszlasa */ SELECT retweet_count, COUNT(*) as number FROM [Twitter].[dbo].[tweet] WHERE run_id = 1001 GROUP BY retweet_count ORDER BY retweet_count /* Összes felhasználó kilistázása, akiknek van geolocation a tweetjeiben, felhasználók ** ** szerint group by-olva és a koordinátákat átlagolva. Mindenből csinálja, kb 8-9perc alatt lefut. ** ** Ezt lehet majd joinolni a mention_mutual view-vel, és abból távolságeloszlást számolni. */ SELECT [Twitter].[dbo].[tweet].user_id as user_id, AVG([Twitter].[dbo].[tweet].lon) as lon, AVG([Twitter].[dbo].[tweet].lat) as lat FROM [Twitter].[dbo].[tweet] WHERE lon <> 0 GROUP BY [Twitter].[dbo].[tweet].user_id Egyéb, elavult /* Ugyanezzel JOIN-olva a kölcsönös mention gráfot, megkapjuk a földrajzi pontpárokat: */ WITH locations (user_id, lon, lat) AS ( SELECT [Twitter].[dbo].[tweet].user_id as user_id, AVG([Twitter].[dbo].[tweet].lon) as lon, AVG([Twitter].[dbo].[tweet].lat) as lat FROM [Twitter].[dbo].[tweet] WHERE lon <> 0 GROUP BY [Twitter].[dbo].[tweet].user_id ) SELECT TOP 100 [Twitter].[dbo].[user_mention_mutual].user_a_id, [Twitter].[dbo].[user_mention_mutual].user_b_id, locations_a.lon as lon_a, locations_a.lat as lat_a, locations_b.lon as lon_b, locations_b.lat as lat_b FROM ([Twitter].[dbo].[user_mention_mutual] JOIN locations AS locations_a ON [Twitter].[dbo].[user_mention_mutual].user_a_id = locations_a.user_id) JOIN locations AS locations_b ON [Twitter].[dbo].[user_mention_mutual].user_b_id = locations_b.user_id