diff --git a/example/scalalib/spark/4-rdd-spark/build.mill b/example/scalalib/spark/4-rdd-spark/build.mill new file mode 100644 index 00000000000..a208105f78d --- /dev/null +++ b/example/scalalib/spark/4-rdd-spark/build.mill @@ -0,0 +1,63 @@ +package build +import mill._, scalalib._ + +object `package` extends RootModule with ScalaModule { + def scalaVersion = "2.12.15" + def ivyDeps = Seq( + ivy"org.apache.spark::spark-core:3.5.4" + ) + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + + def prependShellScript = "" + + object test extends ScalaTests { + def ivyDeps = Seq(ivy"com.lihaoyi::utest:0.8.5") + def testFramework = "utest.runner.Framework" + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + } + +} + +/** Usage + +> ./mill run +... +Basic Transformations: +Squares: 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400 +Even Squares: 4, 16, 36, 64, 100, 144, 196, 256, 324, 400 +... +Key-Value Operations: +ReduceByKey: (0,63), (1,70), (2,77) +GroupByKey: (0,List(3, 6, 9, 12, 15, 18)), (1,List(1, 4, 7, 10, 13, 16, 19)), (2,List(2, 5, 8, 11, 14, 17, 20)) +... +Advanced Operations: +Consistent Sample: 5, 8, 17 +Distinct: 4, 1, 2, 3 +Union: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 +... +Final Counts: +Total numbers: 20 +Total squares: 20 +Total pairs: 20 +... + +> ./mill test +...foo.FooTests.Basic Transformations... +...foo.FooTests.Key-Value Operations... +...foo.FooTests.Advanced Transformations... +...foo.FooTests.Count Operations... +...Tests: 4, Passed: 4, Failed: 0... + +*/ diff --git a/example/scalalib/spark/4-rdd-spark/src/foo/Foo.scala b/example/scalalib/spark/4-rdd-spark/src/foo/Foo.scala new file mode 100644 index 00000000000..3961a4bf784 --- /dev/null +++ b/example/scalalib/spark/4-rdd-spark/src/foo/Foo.scala @@ -0,0 +1,52 @@ +package foo + +import org.apache.spark.{SparkConf, SparkContext} + +object Foo { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf() + .setAppName("Spark RDD Example") + .setMaster("local[*]") + val sc = new SparkContext(conf) + + try { + // 1. Basic RDD Creation & Transformations + val numbers = sc.parallelize(1 to 20) + val squares = numbers.map(x => x * x) + val evenSquares = squares.filter(_ % 2 == 0) + + // 2. Key-Value Pair Operations + val pairs = numbers.map(x => (x % 3, x)) + val reduced = pairs.reduceByKey(_ + _) + val grouped = pairs.groupByKey() + + // 3. Advanced Transformations + val consistentSample = numbers.sample(withReplacement = false, fraction = 0.25, seed = 42L) + val distinct = sc.parallelize(Seq(1, 1, 2, 3, 4, 4)).distinct() + val union = numbers.union(sc.parallelize(15 to 30)) + + // 4. 
Actions & Results + println("Basic Transformations:") + println(s"Squares: ${squares.collect().mkString(", ")}") + println(s"Even Squares: ${evenSquares.collect().mkString(", ")}") + + println("\nKey-Value Operations:") + println(s"ReduceByKey: ${reduced.collect().mkString(", ")}") + println(s"GroupByKey: ${grouped.mapValues(_.toList).collect().mkString(", ")}") + + println("\nAdvanced Operations:") + println(s"Consistent Sample: ${consistentSample.collect().mkString(", ")}") + println(s"Distinct: ${distinct.collect().mkString(", ")}") + println(s"Union: ${union.collect().mkString(", ")}") + + println("\nFinal Counts:") + println(s"Total numbers: ${numbers.count()}") + println(s"Total squares: ${squares.count()}") + println(s"Total pairs: ${pairs.count()}") + + } finally { + sc.stop() + } + } +} diff --git a/example/scalalib/spark/4-rdd-spark/test/src/FooTests.scala b/example/scalalib/spark/4-rdd-spark/test/src/FooTests.scala new file mode 100644 index 00000000000..d76c7390e27 --- /dev/null +++ b/example/scalalib/spark/4-rdd-spark/test/src/FooTests.scala @@ -0,0 +1,85 @@ +package foo + +import org.apache.spark.{SparkConf, SparkContext} +import utest._ + +object FooTests extends TestSuite { + // Initialize SparkContext for tests + val conf = new SparkConf() + .setAppName("Spark RDD Tests") + .setMaster("local[*]") + val sc = new SparkContext(conf) + + // All tests share this single SparkContext, so stop it once after the whole suite completes + override def utestAfterAll(): Unit = { + sc.stop() + } + + val tests = Tests { + test("Basic Transformations") { + val numbers = sc.parallelize(1 to 20) + val squares = numbers.map(x => x * x) + val evenSquares = squares.filter(_ % 2 == 0) + + // Test squares + assert(squares.collect().toList == List( + 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, + 121, 144, 169, 196, 225, 256, 289, 324, 361, 400 + )) + + // Test even squares + assert(evenSquares.collect().toList == List( + 4, 16, 36, 64, 100, 144, 196, 256, 324, 400 + )) + } + + test("Key-Value Operations") { + val numbers = sc.parallelize(1 to 20) + val pairs = numbers.map(x => (x % 3, x)) + val reduced = pairs.reduceByKey(_ + _) + val grouped = pairs.groupByKey() + + // Test reduceByKey + assert(reduced.collect().toMap == Map( + 0 -> 63, // 3+6+9+12+15+18 + 1 -> 70, // 1+4+7+10+13+16+19 + 2 -> 77 // 2+5+8+11+14+17+20 + )) + + // Test groupByKey + val groupedResult = grouped.mapValues(_.toList).collect().toMap + assert(groupedResult(0).sorted == List(3, 6, 9, 12, 15, 18)) + assert(groupedResult(1).sorted == List(1, 4, 7, 10, 13, 16, 19)) + assert(groupedResult(2).sorted == List(2, 5, 8, 11, 14, 17, 20)) + } + + test("Advanced Transformations") { + val numbers = sc.parallelize(1 to 20) + + // Test consistent sampling + val sample1 = numbers.sample(withReplacement = false, 0.25, 42L) + val sample2 = numbers.sample(withReplacement = false, 0.25, 42L) + assert(sample1.collect().toList == sample2.collect().toList) + + // Test distinct + val duplicates = sc.parallelize(Seq(1, 1, 2, 3, 4, 4)) + assert(duplicates.distinct().collect().toSet == Set(1, 2, 3, 4)) + + // Test union + val rdd1 = sc.parallelize(1 to 5) + val rdd2 = sc.parallelize(3 to 7) + assert(rdd1.union(rdd2).collect().toSet == Set(1, 2, 3, 4, 5, 6, 7)) + } + + test("Count Operations") { + val numbers = sc.parallelize(1 to 20) + val pairs = numbers.map(x => (x % 3, x)) + + assert(numbers.count() == 20) + assert(pairs.count() == 20) + + val emptyRDD = sc.parallelize(Seq.empty[Int]) + assert(emptyRDD.count() == 0) + } + } +} diff --git 
a/example/scalalib/spark/5-sql-analytics/build.mill b/example/scalalib/spark/5-sql-analytics/build.mill new file mode 100644 index 00000000000..74a274b5268 --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/build.mill @@ -0,0 +1,105 @@ +package build +import mill._, scalalib._ + +object `package` extends RootModule with ScalaModule { + def scalaVersion = "2.12.15" + def ivyDeps = Seq( + ivy"org.apache.spark::spark-core:3.5.4", + ivy"org.apache.spark::spark-sql:3.5.4" + ) + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + + def prependShellScript = "" + + object test extends ScalaTests { + def ivyDeps = Seq(ivy"com.lihaoyi::utest:0.8.5") + def testFramework = "utest.runner.Framework" + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + } + +} + +/** Usage + +> ./mill run +... +Running analysis: top_products +Results for top_products: ++----------+------------+-----------+----------+ +|product_id|product_name|total_sales|sales_rank| ++----------+------------+-----------+----------+ +|P00008 |Product 8 |4912 |1 | +|P00016 |Product 16 |4312 |2 | +|P00019 |Product 19 |4203 |3 | +|P00017 |Product 17 |3995 |4 | +|P00020 |Product 20 |3158 |5 | +|P00013 |Product 13 |2946 |6 | +|P00007 |Product 7 |2845 |7 | +|P00003 |Product 3 |2744 |8 | +|P00006 |Product 6 |2725 |9 | +|P00005 |Product 5 |2543 |10 | +|P00018 |Product 18 |2394 |11 | +|P00004 |Product 4 |2361 |12 | +|P00009 |Product 9 |2171 |13 | +|P00002 |Product 2 |2088 |14 | +|P00012 |Product 12 |1784 |15 | +|P00011 |Product 11 |1605 |16 | +|P00014 |Product 14 |1144 |17 | +|P00010 |Product 10 |1010 |18 | +|P00001 |Product 1 |929 |19 | +|P00015 |Product 15 |887 |20 | ++----------+------------+-----------+----------+ +... +Running analysis: monthly_sales_trend +Results for monthly_sales_trend: ++----+-----+-------------+----------------+ +|year|month|monthly_sales|cumulative_sales| ++----+-----+-------------+----------------+ +|2024|2 |703 |703 | +|2024|3 |5241 |5944 | +|2024|4 |4741 |10685 | +|2024|5 |6162 |16847 | +|2024|6 |2136 |18983 | +|2024|7 |4009 |22992 | +|2024|8 |2527 |25519 | +|2024|9 |2886 |28405 | +|2024|10 |4997 |33402 | +|2024|11 |3487 |36889 | +|2024|12 |3219 |40108 | +|2025|1 |3597 |43705 | +|2025|2 |7051 |50756 | ++----+-----+-------------+----------------+ +... +Running analysis: sales_by_region +Results for sales_by_region: ++-------------+-----------+ +|region |total_sales| ++-------------+-----------+ +|International|15378 | +|West |10796 | +|North |9466 | +|East |8147 | +|South |6969 | ++-------------+-----------+ +... + +> ./mill test +...foo.FooTests.DataLoadingTest... +...foo.FooTests.SalesByRegionAnalysis... +...foo.FooTests.TopProductsAnalysis... +...foo.FooTests.MonthlyTrendAnalysis... +...Tests: 4, Passed: 4, Failed: 0... 
+ +*/ diff --git a/example/scalalib/spark/5-sql-analytics/resources/data/sales.csv b/example/scalalib/spark/5-sql-analytics/resources/data/sales.csv new file mode 100644 index 00000000000..bebf7a3adb3 --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/resources/data/sales.csv @@ -0,0 +1,101 @@ +sale_id,date,product_id,product_name,amount,region,customer_id,quantity,payment_method +346dba11-01d7-4ac3-bead-a7ff818e95e9,2024-04-03,P00015,Product 15,887,North,7df258e0-a0b3-4bce-9bc7-884331ced640,10,Credit +2334498f-92b7-48c4-af9b-db6b6eeb2c67,2024-03-07,P00018,Product 18,210,International,9af7ed75-3422-4256-96d2-437ad32b10c8,6,Credit +01c63f78-f8f8-4009-b421-a2f38d29e502,2024-04-19,P00019,Product 19,916,North,629ff3f3-39c2-46e1-9d8f-48e38659f526,6,Online +bf3a00ab-aad0-4dd6-8604-b343a5571542,2024-09-06,P00010,Product 10,727,South,f43e65a8-c49e-4223-b57e-9401e60ab0a2,6,Cash +706b17b9-80dc-4e9a-8fa0-7281c0e73513,2024-06-17,P00008,Product 8,97,South,974f0021-b134-4083-9cc5-0649824e02b1,14,Online +0d6bec4c-e6d3-49d1-b5b1-97aadce3ade1,2025-02-07,P00013,Product 13,677,North,1feb1817-39f4-445c-8fdb-30725ede7099,19,Credit +904d804a-206e-4bbf-9862-69829a8dd577,2024-03-20,P00004,Product 4,848,South,1b5f383a-851b-405a-a812-ee97f37d0134,8,Online +c632ad2b-6bdf-4903-a64c-f389d652aa3c,2024-05-02,P00011,Product 11,565,North,bbb9ed42-cbda-47cd-b8b4-4ab3b1c51ff2,13,Credit +1607d1c6-1334-4298-98f4-9448b9644b10,2024-11-14,P00019,Product 19,669,International,f383e7f6-f966-4005-8474-865acc27fa72,15,Online +d630f1a5-de45-4131-85d4-64fb891b097f,2024-11-27,P00016,Product 16,264,East,70591c6b-7272-4c05-b149-5fd8bafb3d58,17,Debit +94f95fc9-d796-4497-8dd1-9d94c804bef4,2024-09-21,P00008,Product 8,836,South,95e4d617-2114-4800-8ea7-46d554d10120,8,Online +d2a6f386-92c8-4ed6-9a5b-33189f7fcbe9,2024-07-05,P00006,Product 6,941,International,8018254c-2c25-4ac3-9682-f1eb2de50861,2,Online +c294c880-4067-44ae-89bc-892cfc559882,2024-08-14,P00005,Product 5,228,South,b87be222-edb8-4100-9b3f-b011c5ca478f,15,Cash +82ec94c9-9bf1-421a-bc38-b95932c8728e,2025-02-17,P00014,Product 14,781,International,c8059c8f-ba66-48d2-a53e-ca88dfc1c028,3,Debit +c7b3c7b0-cc7d-443a-8af3-47925b4b634a,2024-07-12,P00014,Product 14,363,North,a3aee658-2676-44ff-b306-30d73628b0bd,12,Debit +14ba694a-16f4-46bf-a540-12e5dacfefe7,2024-05-12,P00012,Product 12,959,West,df2cc1c4-6dfc-4eb7-ba81-b15828270023,15,Debit +1268e793-caf0-40cb-8129-b821c8d36d6d,2024-06-23,P00016,Product 16,49,South,9856124c-e1be-4d5e-94f8-5fffd23cd9cd,14,Credit +df272f37-10de-44b4-b6f0-0801ff2fb465,2025-02-14,P00002,Product 2,862,West,d87c1c61-9306-4498-826f-c34c340857a3,9,Online +e40dd94d-1b62-4ef3-bb2a-a798bc2520ba,2024-02-26,P00020,Product 20,703,East,2c46f79d-9c55-4a22-94f6-15b8a1a9c1ef,9,Online +2c21aeb3-b16c-462f-9c8c-a5c1f3f6cc06,2024-06-10,P00017,Product 17,903,South,9b3f53b6-6811-43f7-b3f8-91f14580ad2a,2,Cash +5023c668-7638-4f08-818b-91837a5d73bf,2024-03-23,P00016,Product 16,85,International,b9eeb045-171c-4f80-9455-9fc6a01f7793,19,Online +5c1e414f-7127-42e9-8f03-5eb177ece965,2024-05-01,P00012,Product 12,567,West,cb9bd0cf-8721-48b4-9335-debae260204d,14,Online +8808f7fb-a648-4237-a067-3fcc92229832,2024-08-02,P00019,Product 19,174,West,bdc853f1-25ed-4599-a70f-a86326d6f798,17,Online +74348ea4-2133-434b-85c0-269bb5157b1f,2024-11-11,P00003,Product 3,122,North,6387a02f-cdc7-4f1b-8473-87b1fa1e5b1d,15,Cash +78aeb422-77b7-41d5-8aa3-128367029708,2024-04-28,P00009,Product 9,69,East,aeb6744b-22bf-480d-8e1f-dd0c65b509e0,18,Debit +b2f77af8-2c75-4a3f-81d6-418c5a8b2065,2024-03-18,P00004,Product 
4,651,South,072e3d4c-c371-48eb-8b64-69241087de5c,9,Cash +3a791ba7-e08f-42c2-b45b-dbc8d85ee939,2024-03-27,P00005,Product 5,493,East,2a562e7d-83c6-439e-9543-3d46e6b83fd9,5,Online +cc188d61-fb9d-40b5-b26b-6ce645fda280,2025-02-18,P00001,Product 1,332,West,4acc6fa7-24fc-456c-b1b4-b43f1c0034c9,3,Online +bf328657-5538-49a1-844e-b59c300d128b,2024-07-13,P00020,Product 20,868,International,71c3a543-f06d-4493-a575-2c8c699a4b0e,20,Debit +0b958f0f-c595-4dc2-ba9d-876a7a663272,2025-02-11,P00009,Product 9,653,West,6175f23c-4804-4f2b-a9c3-1aefaef8def0,2,Cash +f2864b11-e08c-486e-a3e2-f7525a3461a3,2024-05-03,P00019,Product 19,993,South,9efeedae-84ec-4657-9e12-f3aa6d2e686a,5,Credit +37e7a8de-e8cb-4e56-a607-9ca1b567094c,2024-06-03,P00018,Product 18,464,West,b70e8475-9b12-4bae-98da-fae3e538d5d9,8,Cash +301efeea-c1ec-48ce-891b-9a53f5e909bf,2025-02-21,P00002,Product 2,392,North,2c9c3d02-20ac-42fd-9830-16af9ee4c9d8,9,Credit +089bf81d-fc77-4965-98de-fac2325d745b,2024-07-18,P00007,Product 7,472,North,24ba6a73-1b79-4faa-8dfd-2dea8c50f598,6,Debit +497e29b6-eb5c-406d-a78d-c19ce5e95e6f,2024-03-20,P00007,Product 7,295,East,379d5e0f-8352-40d3-b905-33afcf111f38,7,Credit +c28c4212-a045-4121-8e82-658aa5ff3850,2024-04-20,P00009,Product 9,493,West,659ad38d-120b-4885-bc9c-b39f6865c4eb,20,Credit +3cc0b5d3-e852-4711-bb6c-c15b2f280926,2024-12-16,P00016,Product 16,788,West,ef485116-2c09-4311-affa-5b0009bb759f,11,Cash +ff17cf63-4224-4b8d-838a-09f7dea54ecd,2024-10-30,P00018,Product 18,729,West,4e5e3344-3be1-4450-a7e5-5502dbcec873,1,Online +bcb5b005-2eb7-460a-99a0-14573a775c04,2024-04-23,P00005,Product 5,72,International,105b8436-7099-4c24-b33b-3dcf8ea42b90,20,Credit +b545807f-a887-4e4d-b2e8-75f4d5831aa7,2024-08-04,P00005,Product 5,910,North,73ef9e05-8f42-4d03-8736-3c2a6dcc366a,9,Cash +8bcd9858-155f-4914-a3e5-07617f8f8d95,2025-01-09,P00017,Product 17,765,West,34f2d331-1282-49ca-a025-b3435619b504,2,Cash +72af284d-2732-43c1-8a0e-02e681a387c4,2024-12-17,P00013,Product 13,842,North,a8de1b2d-44ee-40fc-ab26-ecc447695f45,16,Debit +c939729d-27d0-4018-a6d3-722467e53496,2024-10-01,P00004,Product 4,862,International,75925a01-aecd-4793-ab04-a1e40e7dce81,2,Credit +c46db782-2ebb-4ad8-9c21-142bb9c6a317,2025-02-01,P00011,Product 11,143,International,2972c812-c802-4aeb-a494-30abc7f212bf,17,Cash +c032cb9d-39ce-4c83-bc2e-b2996b41361a,2024-12-27,P00003,Product 3,446,East,377867f7-251f-4b58-bee0-253601a10592,1,Online +ae8f2713-a420-466c-aa3a-1122a3523844,2024-12-04,P00005,Product 5,383,East,d22a2c64-ebf1-47c3-928a-8d6f492fe5b6,5,Debit +aff3ef0d-16bc-4346-8f09-e81ece0d1956,2024-05-14,P00010,Product 10,283,North,92e82fb6-0170-4081-8131-5a5bdd078cf6,19,Debit +5de03fbf-65c2-475b-89ba-08ef8f398b35,2024-04-24,P00005,Product 5,69,North,ed849971-e772-4a8c-b9dc-4f7de71e6f85,20,Cash +55ba2eda-23cf-4573-89af-7427f861205e,2024-09-07,P00018,Product 18,95,South,17371ed9-5b55-4adf-9113-874e5a0b40f9,4,Credit +4d8f1233-cb91-499d-9a6a-ece50da17d33,2024-08-25,P00013,Product 13,850,International,38b29ea5-7008-4003-b9e0-d8793270e4d9,16,Debit +e59a663b-0854-4e13-b137-5afd2db4d8ef,2024-10-15,P00008,Product 8,787,East,236e29c8-83fa-4d57-ba64-85a4f816db5a,5,Credit +dd2f6285-dada-472b-b019-89db2caf0452,2024-12-15,P00020,Product 20,659,International,1f6dee26-305f-4fd4-b97c-c05a629b3760,1,Credit +2bbc035e-839a-41e2-91a0-31909d34ad81,2024-03-17,P00016,Product 16,461,International,fa29ab35-b6c8-4ba6-bb5f-a261c91374da,9,Debit +e05c1766-82b0-4922-9420-76b6e1639ce4,2024-11-16,P00008,Product 8,851,International,f8a31995-87e1-4f6f-96b7-e0b4057b6c70,9,Cash 
+7322b1ad-8a86-4010-9c43-555232e9ea3d,2024-05-25,P00009,Product 9,377,International,9f2aa199-49f5-4710-8cc4-36db75284fa6,3,Cash +fcbf56f9-b596-4305-8a01-54687b50f9c7,2025-01-02,P00009,Product 9,286,North,7bf9c3d5-4c36-4ab6-a2d8-4814b2a10f98,3,Debit +ca526b50-f1f4-4dd5-9356-2d0c7939dfdc,2024-03-18,P00020,Product 20,831,West,80c7333e-f891-4676-ae8c-22aa84a3cd47,19,Online +be2938c3-7555-4553-b6d8-aabfb41f62c7,2024-08-23,P00006,Product 6,365,West,1402ce41-55ab-48c5-96b5-397025b8d8b7,14,Debit +74d58264-010b-49c9-920f-2e189fcedaca,2024-09-25,P00011,Product 11,701,North,6bafb314-2a6b-4c41-b14a-9f57c054b05d,17,Cash +2aaa5efb-9c3f-41df-81fb-112d91f08c28,2025-01-11,P00003,Product 3,1000,West,bc2f91cc-083e-40eb-9d3a-60886407448c,14,Cash +ea0e5008-998f-4630-862c-ea586da0b650,2024-04-15,P00018,Product 18,368,International,81f4add5-8b5a-4ff5-9aef-ccbfbef0ca5a,13,Online +9cf73e47-44ec-46c5-8e76-c5f04319f072,2024-05-02,P00008,Product 8,193,North,cb69524f-2ae3-4829-85cd-0099e0fde953,18,Online +f90fc466-2781-40bc-857a-c1aa2d858dd3,2024-05-07,P00013,Product 13,577,North,6cee1792-f98b-40fd-8123-cca8ae4f1fcd,11,Online +2fb8392f-3bf8-4bfb-8b40-12c9b917004c,2024-03-26,P00007,Product 7,788,International,c6ebe566-73e4-470a-8276-2e41f5accb0a,1,Credit +1d328dfb-34e3-442e-bc77-1fdd33d7eef0,2025-02-21,P00001,Product 1,318,South,4da3f6dc-5a09-41b2-b9cc-fd4daafaa80e,9,Cash +de441a3a-9d15-42a0-8b4d-c5bafc88e2f9,2025-01-20,P00008,Product 8,190,West,bc8a5262-222a-4137-bfa8-99aba1e4caf5,2,Credit +8206ef72-08bf-4118-a683-d049077b1799,2024-05-18,P00007,Product 7,224,West,7e4cd392-1be9-4221-bcf8-5d56b1b98433,2,Online +d5cb396f-f0c2-4dea-a19c-d1cc1920190f,2024-10-30,P00007,Product 7,965,East,d7fa3de3-dcb1-4e5a-aa7b-c31c29f74500,12,Cash +a138b1d4-2598-4d0a-9e5e-e22c6d8d4c82,2025-01-15,P00017,Product 17,392,West,b4ab5ab4-47c4-4c2f-a1e2-e62efcd8a7fd,12,Online +71bc14af-cd71-4ee4-8d98-776bcf548704,2024-07-13,P00016,Product 16,928,International,631beee0-bc54-47df-b945-8dd10f0a504b,17,Cash +630dc4c0-d826-4f70-a2c7-0803084421a9,2024-04-17,P00005,Product 5,322,West,2fadb24d-1d69-4889-af1d-41ccb9019de8,2,Online +a1ba0ebf-6780-4e5d-8640-4bc04cea7ada,2024-03-04,P00006,Product 6,195,East,e4035acc-8ff8-4de0-9ab2-442be1dd0ba4,1,Cash +0be4b8dd-42a3-452a-b0dc-d67019cf3c63,2024-10-25,P00006,Product 6,500,East,18ff07a0-928c-4b2a-be46-e85500c4e155,7,Debit +5ab07631-5e32-44b5-b42d-7811f1024827,2024-06-11,P00008,Product 8,623,East,5bb2b9cd-1768-40d3-8254-ca1f1bf595f0,6,Credit +eee7d8b7-bb57-4269-8132-cf298b005ebd,2025-01-26,P00018,Product 18,139,International,ef7fc54b-5489-4b4f-bc3b-f393d10a5ac5,18,Online +8d5f573a-c785-40c5-9117-0cd4cb1d49b2,2024-07-05,P00005,Product 5,66,East,bc720103-14b2-4613-b372-f498159243ca,4,Online +efe5c50f-415a-4280-bab6-ea79b73421cf,2024-10-16,P00001,Product 1,279,West,16858dbe-139e-40ae-a2e6-895da09d2b97,8,Debit +80b80c55-834b-441b-b262-fd5cff3ff4d9,2024-12-16,P00007,Product 7,101,East,b28c1ed9-2563-4704-9725-72d0b5da5f3a,12,Online +e6f3c353-ebb0-4f80-9e0a-2ab99e4dbbcf,2025-02-14,P00012,Product 12,258,South,b2ff6f1f-372c-4677-a8a7-715f6aa7244f,4,Credit +feccb4a5-9f25-4ea0-b184-4d25e78ad16b,2024-07-04,P00019,Product 19,175,International,579ab5f8-db07-4580-82ac-adb0fa9e1043,3,Credit +a57049d9-c63b-4121-929e-6f9cf55c3e0f,2025-02-04,P00008,Product 8,426,North,a24a60dd-4080-4d4d-962d-237ba092c48c,13,Online +07c87915-ef24-42b0-8037-55915d6a8410,2024-05-10,P00019,Product 19,310,West,3f458936-4464-4bd4-b047-0c9e8b7d46d6,9,Debit +aa1de28b-681d-499f-855e-0f14302c1ccc,2024-05-22,P00008,Product 
8,909,East,e48a1796-c372-4256-a755-c3b158ac37b3,11,Online +608aad0b-91fa-431d-ab64-33e4f91e670c,2024-11-14,P00017,Product 17,812,International,4090db41-adaf-4132-9a4a-86dbb44bddcc,18,Cash +38982678-606a-4514-a02d-54306ac42655,2024-10-20,P00003,Product 3,190,East,cf2d7a77-cf29-4581-868c-29214fc8dea2,14,Online +3db61347-cf67-4337-a74a-7b168dedc77c,2024-10-08,P00017,Product 17,149,International,be62be5c-c9fd-44be-abf7-8b62b4d96ba5,10,Credit +7f7686d9-590b-4f40-a7dc-7df220710769,2024-04-02,P00016,Product 16,912,International,bee4ca59-4005-493a-8075-ff6197073d0a,16,Online +ab3d5bfb-c94a-471e-9685-9f49a0dc60bc,2024-11-02,P00017,Product 17,769,East,72f88635-522a-461d-808f-afeb95d0befd,6,Cash +64497b2e-85d6-421e-a48c-cb61af2297c0,2025-01-04,P00016,Product 16,825,International,153cbc3e-1e58-438d-b1c2-0f546ff212a8,5,Cash +5ec85aff-1223-410e-bb32-2e21d91c9254,2024-10-10,P00019,Product 19,439,South,8b280a8a-d073-4362-af94-caf15627df77,18,Debit +6b63fd4a-1641-4fd6-a192-cee648bcae7d,2024-03-16,P00009,Product 9,293,North,a1b09f4a-67d2-4ebd-ac2a-80f5bedb12e8,20,Credit +4175ab4b-0fdd-449d-ab9e-e519b9bbce94,2025-02-14,P00003,Product 3,986,International,8b993ce8-beb1-4248-a347-2ab5de72a76f,19,Credit +6b4ed217-d819-4382-96ae-b68776e83959,2024-03-15,P00006,Product 6,91,North,c15c928c-35c3-48eb-991f-5b0a520e3900,2,Online +08cf8697-d809-4125-8e2d-1f57893fde4a,2025-02-05,P00002,Product 2,834,International,18989b5b-ba18-4965-b09a-7829bdadca10,12,Credit +6f78395e-d6dc-4d26-8ebf-edbe7fc6a351,2024-04-26,P00006,Product 6,633,International,f648d37e-3ec2-4706-9ac9-e08f81424601,18,Credit +a8e6f1e5-8958-4f55-a274-343dfefc6372,2024-09-21,P00019,Product 19,527,South,d433c1f1-c1e0-4fd5-847b-b4b85fbd63f1,20,Online +b993a87d-3a5d-49b6-839f-866abdc5a670,2025-02-20,P00018,Product 18,389,East,edadb17f-8a22-4583-89f2-2ac84652b066,17,Debit +12d425ee-267a-4357-8d71-51d623e0a4a1,2024-10-04,P00020,Product 20,97,West,af28685c-1afe-42ce-9864-fd4cddb96c78,11,Cash +ffa5a875-be1a-4b7f-9e6b-01a33cfa1101,2024-05-29,P00017,Product 17,205,North,08c5aba2-b0ed-427a-99a7-33b1da03026b,12,Debit +5c816f81-2b8b-49d4-8145-c539e91cc23b,2024-07-23,P00011,Product 11,196,North,cbbe74ac-7df6-41a9-843d-e58858a458a3,7,Debit diff --git a/example/scalalib/spark/5-sql-analytics/resources/queries/monthly_sales_trend.sql b/example/scalalib/spark/5-sql-analytics/resources/queries/monthly_sales_trend.sql new file mode 100644 index 00000000000..70c3e2d47ea --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/resources/queries/monthly_sales_trend.sql @@ -0,0 +1,8 @@ +SELECT + YEAR(date) AS year, + MONTH(date) AS month, + SUM(amount) AS monthly_sales, + SUM(SUM(amount)) OVER (ORDER BY YEAR(date), MONTH(date)) AS cumulative_sales +FROM sales +GROUP BY YEAR(date), MONTH(date) +ORDER BY year, month \ No newline at end of file diff --git a/example/scalalib/spark/5-sql-analytics/resources/queries/sales_by_region.sql b/example/scalalib/spark/5-sql-analytics/resources/queries/sales_by_region.sql new file mode 100644 index 00000000000..e308c9a6e37 --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/resources/queries/sales_by_region.sql @@ -0,0 +1,4 @@ +SELECT region, SUM(amount) AS total_sales +FROM sales +GROUP BY region +ORDER BY total_sales DESC \ No newline at end of file diff --git a/example/scalalib/spark/5-sql-analytics/resources/queries/top_products.sql b/example/scalalib/spark/5-sql-analytics/resources/queries/top_products.sql new file mode 100644 index 00000000000..a2f8f999926 --- /dev/null +++ 
b/example/scalalib/spark/5-sql-analytics/resources/queries/top_products.sql @@ -0,0 +1,9 @@ +WITH product_sales AS ( + SELECT product_id, product_name, SUM(amount) AS total_sales + FROM sales + GROUP BY product_id, product_name +) +SELECT product_id, product_name, total_sales, + RANK() OVER (ORDER BY total_sales DESC) AS sales_rank +FROM product_sales +ORDER BY sales_rank \ No newline at end of file diff --git a/example/scalalib/spark/5-sql-analytics/src/foo/Foo.scala b/example/scalalib/spark/5-sql-analytics/src/foo/Foo.scala new file mode 100644 index 00000000000..425c83f0fce --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/src/foo/Foo.scala @@ -0,0 +1,46 @@ +package foo + +import org.apache.spark.sql.SparkSession +import scala.io.Source +import java.io.File + +object Foo { + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("SQL Analytics") + .master("local[*]") + .getOrCreate() + + try { + // Load embedded CSV data + val df = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(getClass.getResource("/data/sales.csv").getPath) + + df.createOrReplaceTempView("sales") + + // Directory containing SQL queries + val queryDir = new File(getClass.getResource("/queries").getPath) + + if (queryDir.exists() && queryDir.isDirectory) { + val sqlFiles = queryDir.listFiles().filter(_.getName.endsWith(".sql")) + + for (file <- sqlFiles) { + val analysisName = file.getName.stripSuffix(".sql") + println(s"Running analysis: $analysisName") + + val query = Source.fromFile(file).mkString + val result = spark.sql(query) + + println(s"Results for $analysisName:") + result.show(truncate = false) + } + } else { + println("No queries folder found or it's empty.") + } + } finally { + spark.stop() + } + } +} diff --git a/example/scalalib/spark/5-sql-analytics/test/src/FooTests.scala b/example/scalalib/spark/5-sql-analytics/test/src/FooTests.scala new file mode 100644 index 00000000000..4a1dc727f4e --- /dev/null +++ b/example/scalalib/spark/5-sql-analytics/test/src/FooTests.scala @@ -0,0 +1,106 @@ +package foo + +import utest._ +import org.apache.spark.sql.SparkSession +import scala.io.Source + +object FooTests extends TestSuite { + val spark: SparkSession = SparkSession.builder() + .appName("SQL Analytics Tests") + .master("local[*]") + .getOrCreate() + + // Sample test data matching production schema + val testData: String = + """sale_id,date,product_id,product_name,amount,region,customer_id,quantity,payment_method + |1,2023-01-01,P001,Product 1,100.0,North,cust1,1,Credit + |2,2023-01-02,P002,Product 2,200.0,South,cust2,2,Debit + |3,2023-01-03,P001,Product 1,300.0,North,cust3,3,Cash + |4,2023-02-01,P003,Product 3,50.0,East,cust4,4,Online + |5,2023-02-02,P002,Product 2,150.0,South,cust5,5,Debit + """.stripMargin + + override def utestBeforeEach(path: Seq[String]): Unit = { + import spark.implicits._ + + // Create test DataFrame from CSV string + val df = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(spark.createDataset(testData.split("\n").toSeq)) + + df.createOrReplaceTempView("sales") + } + + override def tests: Tests = Tests { + test("DataLoadingTest") { + val df = spark.sql("SELECT * FROM sales") + + // Verify schema + val expectedColumns = Set( + "sale_id", + "date", + "product_id", + "product_name", + "amount", + "region", + "customer_id", + "quantity", + "payment_method" + ) + assert(df.columns.toSet == expectedColumns) + + // Verify row count + assert(df.count() == 5) + } + 
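+ // The remaining tests mirror the production flow in Foo.main: each one loads + // a bundled .sql resource and runs it against the "sales" view registered in + // utestBeforeEach, e.g. + // val query = Source.fromResource("queries/sales_by_region.sql").mkString + // spark.sql(query).collect() 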
+ test("SalesByRegionAnalysis") { + val query = Source.fromResource("queries/sales_by_region.sql").mkString + val result = spark.sql(query).collect() + + // Verify aggregation and sorting + val expected = Seq( + ("North", 400.0), + ("South", 350.0), + ("East", 50.0) + ) + + assert(result.map(r => (r.getString(0), r.getDouble(1))).toSeq == expected) + } + + test("TopProductsAnalysis") { + val query = Source.fromResource("queries/top_products.sql").mkString + val result = spark.sql(query).collect() + + // Verify ranking and totals + val expected = Seq( + ("P001", "Product 1", 400.0, 1), + ("P002", "Product 2", 350.0, 2), + ("P003", "Product 3", 50.0, 3) + ) + + assert(result.map { r => + (r.getString(0), r.getString(1), r.getDouble(2), r.getInt(3)) + }.toSeq == expected) + } + + test("MonthlyTrendAnalysis") { + val query = Source.fromResource("queries/monthly_sales_trend.sql").mkString + val result = spark.sql(query).collect() + + // Verify cumulative sums and monthly totals + val expected = Seq( + (2023, 1, 600.0, 600.0), + (2023, 2, 200.0, 800.0) + ) + + assert(result.map { r => + (r.getInt(0), r.getInt(1), r.getDouble(2), r.getDouble(3)) + }.toSeq == expected) + } + } + + override def utestAfterAll(): Unit = { + spark.stop() + } +} diff --git a/example/scalalib/spark/6-spark-streaming/build.mill b/example/scalalib/spark/6-spark-streaming/build.mill new file mode 100644 index 00000000000..0ea3bd531fb --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/build.mill @@ -0,0 +1,96 @@ +package build +import mill._, scalalib._ + +object `package` extends RootModule with ScalaModule { + def scalaVersion = "2.12.15" + def ivyDeps = Seq( + ivy"org.apache.spark::spark-core:3.5.4", + ivy"org.apache.spark::spark-sql:3.5.4" + ) + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + + def prependShellScript = "" + + object test extends ScalaTests { + def ivyDeps = Seq(ivy"com.lihaoyi::utest:0.8.5") + def testFramework = "utest.runner.Framework" + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + } + +} + +/** Usage + +> ./mill run +... +Is this Streaming DataFrame : true +... +Schema of DataFrame +root + |-- Date: string (nullable = true) + |-- Open: double (nullable = true) + |-- High: double (nullable = true) + |-- Low: double (nullable = true) + |-- Close: double (nullable = true) + |-- Adjusted Close: double (nullable = true) + |-- Volume: double (nullable = true) + |-- Name: string (nullable = true) +... +------------------------------------------- +Batch: 0 +------------------------------------------- ++----------+------+------+------+------+--------------+-----------+----+ +|Date |Open |High |Low |Close |Adjusted Close|Volume |Name| ++----------+------+------+------+------+--------------+-----------+----+ +|2024-06-07|248.77|249.63|246.44|247.89|246.95 |8.4809996E7|AMZN| +|2024-06-06|247.63|248.85|247.1 |248.77|247.04 |7.4062169E7|AMZN| +|2024-06-05|240.65|249.11|239.52|247.63|249.06 |7.3945578E7|AMZN| ++----------+------+------+------+------+--------------+-----------+----+ +only showing top 3 rows +... 
+------------------------------------------- +Batch: 0 +------------------------------------------- ++----+-----+------+ +|Year|Name |Max | ++----+-----+------+ +|2024|TSLA |201.41| +|2024|GOOGL|170.26| +|2024|AAPL |192.28| ++----+-----+------+ +only showing top 3 rows +... +------------------------------------------- +Batch: 0 +------------------------------------------- ++----+-----+------+ +|Year|Name |Max | ++----+-----+------+ +|2024|TSLA |201.41| +|2024|GOOGL|170.26| +|2024|AAPL |192.28| ++----+-----+------+ +only showing top 3 rows +... + +> ./mill test +...foo.FooTests.createStreamingDF should return a streaming DataFrame... +...foo.FooTests.getFileName should extract the correct file name... +...foo.FooTests.append mode should output raw data... +...foo.FooTests.update mode should output aggregated data... +...foo.FooTests.complete mode should output aggregated data... +...Tests: 5, Passed: 5, Failed: 0... + +*/ diff --git a/example/scalalib/spark/6-spark-streaming/resources/data/AAPL_stock_data.csv b/example/scalalib/spark/6-spark-streaming/resources/data/AAPL_stock_data.csv new file mode 100644 index 00000000000..aaa42ba2cdd --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/resources/data/AAPL_stock_data.csv @@ -0,0 +1,51 @@ +Date,Open,High,Low,Close,Adjusted Close,Volume,Name +2024-06-07,191.75,192.27,188.46,189.69,190.54,76658609.0,Apple Inc. +2024-06-06,190.36,192.28,189.54,191.75,190.59,75533478.0,Apple Inc. +2024-06-05,189.22,190.69,188.9,190.36,190.33,50498863.0,Apple Inc. +2024-06-04,190.06,190.98,188.44,189.22,189.51,74850677.0,Apple Inc. +2024-06-03,187.6,190.56,186.05,190.06,190.07,99913224.0,Apple Inc. +2024-05-31,188.93,189.74,186.68,187.6,188.71,90869792.0,Apple Inc. +2024-05-30,184.24,189.45,183.77,188.93,189.43,71130343.0,Apple Inc. +2024-05-29,187.67,189.15,183.18,184.24,185.69,77460368.0,Apple Inc. +2024-05-28,186.78,188.03,186.71,187.67,187.27,93167705.0,Apple Inc. +2024-05-27,189.42,189.64,185.23,186.78,188.55,60192257.0,Apple Inc. +2024-05-24,189.04,191.4,188.53,189.42,188.45,61616410.0,Apple Inc. +2024-05-23,186.39,189.13,186.29,189.04,188.15,67882750.0,Apple Inc. +2024-05-22,189.25,189.55,185.82,186.39,185.89,74390485.0,Apple Inc. +2024-05-21,190.15,190.24,188.26,189.25,187.71,74190239.0,Apple Inc. +2024-05-20,186.35,190.98,184.16,190.15,190.41,74883825.0,Apple Inc. +2024-05-17,181.22,188.97,180.64,186.35,186.94,64716695.0,Apple Inc. +2024-05-16,176.38,182.79,174.98,181.22,182.49,83999533.0,Apple Inc. +2024-05-15,179.39,179.42,174.98,176.38,176.91,82474983.0,Apple Inc. +2024-05-14,181.47,183.01,178.48,179.39,180.27,84968898.0,Apple Inc. +2024-05-13,179.61,181.83,178.82,181.47,180.84,86635193.0,Apple Inc. +2024-05-10,182.84,183.48,178.85,179.61,178.66,51989173.0,Apple Inc. +2024-05-09,181.1,183.2,179.95,182.84,183.61,67465321.0,Apple Inc. +2024-05-08,180.07,182.1,178.9,181.1,180.1,92632984.0,Apple Inc. +2024-05-07,181.89,181.96,180.03,180.07,180.61,53886402.0,Apple Inc. +2024-05-06,178.45,183.25,178.15,181.89,182.59,68992383.0,Apple Inc. +2024-05-03,177.95,179.27,177.76,178.45,178.63,62240032.0,Apple Inc. +2024-05-02,178.25,179.45,175.49,177.95,176.75,81725526.0,Apple Inc. +2024-05-01,179.22,180.33,178.19,178.25,176.49,64650271.0,Apple Inc. +2024-04-30,181.59,181.77,179.19,179.22,179.81,93369428.0,Apple Inc. +2024-04-29,179.68,182.06,177.76,181.59,180.14,87890300.0,Apple Inc. +2024-04-26,180.66,181.64,179.61,179.68,177.92,89769056.0,Apple Inc. +2024-04-25,180.38,181.0,180.12,180.66,181.05,99742523.0,Apple Inc. 
+2024-04-24,179.26,180.69,179.08,180.38,181.81,95075554.0,Apple Inc. +2024-04-23,180.12,180.96,177.37,179.26,178.05,92058334.0,Apple Inc. +2024-04-22,182.09,183.02,179.72,180.12,178.62,75297197.0,Apple Inc. +2024-04-19,184.1,185.13,181.93,182.09,182.61,77097584.0,Apple Inc. +2024-04-18,185.84,186.46,183.3,184.1,185.13,61275612.0,Apple Inc. +2024-04-17,181.91,186.52,181.57,185.84,187.28,77674903.0,Apple Inc. +2024-04-16,176.87,182.3,176.45,181.91,183.36,96315618.0,Apple Inc. +2024-04-15,180.29,180.62,175.46,176.87,177.67,86481541.0,Apple Inc. +2024-04-12,177.56,182.05,177.32,180.29,179.75,86998516.0,Apple Inc. +2024-04-11,175.79,177.9,175.78,177.56,176.98,53451356.0,Apple Inc. +2024-04-10,171.78,176.85,171.52,175.79,176.26,55130397.0,Apple Inc. +2024-04-09,174.77,175.23,171.49,171.78,173.16,62707035.0,Apple Inc. +2024-04-08,176.93,177.65,173.31,174.77,176.16,93703467.0,Apple Inc. +2024-04-05,179.12,180.59,176.54,176.93,175.49,72487056.0,Apple Inc. +2024-04-04,177.35,179.42,177.03,179.12,178.19,70818992.0,Apple Inc. +2024-04-03,174.46,177.47,173.76,177.35,177.46,62580031.0,Apple Inc. +2024-04-02,169.78,174.63,168.81,174.46,174.65,87327394.0,Apple Inc. +2024-04-01,170.0,170.74,169.69,169.78,170.86,90820227.0,Apple Inc. diff --git a/example/scalalib/spark/6-spark-streaming/resources/data/AMZN_stock_data.csv b/example/scalalib/spark/6-spark-streaming/resources/data/AMZN_stock_data.csv new file mode 100644 index 00000000000..b4cdefc4ea7 --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/resources/data/AMZN_stock_data.csv @@ -0,0 +1,51 @@ +Date,Open,High,Low,Close,Adjusted Close,Volume,Name +2024-06-07,248.77,249.63,246.44,247.89,246.95,84809996.0,Amazon.com Inc. +2024-06-06,247.63,248.85,247.1,248.77,247.04,74062169.0,Amazon.com Inc. +2024-06-05,240.65,249.11,239.52,247.63,249.06,73945578.0,Amazon.com Inc. +2024-06-04,239.94,240.65,239.1,240.65,242.54,92019117.0,Amazon.com Inc. +2024-06-03,241.65,242.53,239.36,239.94,240.1,78688729.0,Amazon.com Inc. +2024-05-31,240.54,243.34,239.54,241.65,242.23,88142766.0,Amazon.com Inc. +2024-05-30,234.37,241.62,233.99,240.54,241.96,57395350.0,Amazon.com Inc. +2024-05-29,237.37,238.02,233.51,234.37,232.68,93922119.0,Amazon.com Inc. +2024-05-28,231.03,238.8,230.98,237.37,236.59,86940933.0,Amazon.com Inc. +2024-05-27,225.81,231.78,225.15,231.03,231.62,72620709.0,Amazon.com Inc. +2024-05-24,223.33,226.01,222.11,225.81,227.79,59451337.0,Amazon.com Inc. +2024-05-23,223.36,224.97,223.26,223.33,221.61,75554173.0,Amazon.com Inc. +2024-05-22,219.16,224.32,217.5,223.36,223.92,91758730.0,Amazon.com Inc. +2024-05-21,217.92,219.29,217.28,219.16,220.94,58357956.0,Amazon.com Inc. +2024-05-20,218.48,219.6,217.01,217.92,218.11,72674714.0,Amazon.com Inc. +2024-05-17,213.11,221.56,212.85,218.48,217.98,57726890.0,Amazon.com Inc. +2024-05-16,212.71,213.42,212.66,213.11,214.81,99282339.0,Amazon.com Inc. +2024-05-15,212.8,213.37,211.09,212.71,211.57,62920739.0,Amazon.com Inc. +2024-05-14,210.79,213.04,208.99,212.8,212.81,77695858.0,Amazon.com Inc. +2024-05-13,211.32,212.96,210.03,210.79,211.84,70154994.0,Amazon.com Inc. +2024-05-10,215.47,216.84,210.75,211.32,210.48,52244702.0,Amazon.com Inc. +2024-05-09,218.35,219.86,214.81,215.47,215.31,55525633.0,Amazon.com Inc. +2024-05-08,220.41,222.71,217.8,218.35,217.81,76873516.0,Amazon.com Inc. +2024-05-07,217.11,220.68,215.44,220.41,220.14,65043403.0,Amazon.com Inc. +2024-05-06,214.74,217.68,213.99,217.11,218.48,61618627.0,Amazon.com Inc. +2024-05-03,210.99,215.27,210.9,214.74,214.28,95103150.0,Amazon.com Inc. 
+2024-05-02,214.98,215.74,210.02,210.99,212.59,58514716.0,Amazon.com Inc. +2024-05-01,217.88,218.5,214.48,214.98,214.38,89910660.0,Amazon.com Inc. +2024-04-30,217.84,218.1,217.09,217.88,217.98,86982407.0,Amazon.com Inc. +2024-04-29,212.77,217.96,212.2,217.84,218.56,51805468.0,Amazon.com Inc. +2024-04-26,211.08,212.99,210.17,212.77,214.11,59828366.0,Amazon.com Inc. +2024-04-25,209.48,211.77,209.25,211.08,209.66,78614327.0,Amazon.com Inc. +2024-04-24,204.85,209.52,204.8,209.48,208.07,93842483.0,Amazon.com Inc. +2024-04-23,203.63,206.82,203.36,204.85,203.63,60688727.0,Amazon.com Inc. +2024-04-22,199.91,204.83,199.08,203.63,204.83,66643948.0,Amazon.com Inc. +2024-04-19,198.71,200.46,198.66,199.91,198.47,61606243.0,Amazon.com Inc. +2024-04-18,196.05,199.75,195.19,198.71,200.07,66413486.0,Amazon.com Inc. +2024-04-17,198.9,199.01,195.46,196.05,196.34,55397662.0,Amazon.com Inc. +2024-04-16,199.32,200.11,198.41,198.9,199.86,72468492.0,Amazon.com Inc. +2024-04-15,197.17,199.85,196.51,199.32,197.59,87612325.0,Amazon.com Inc. +2024-04-12,194.52,197.67,193.96,197.17,195.91,89367797.0,Amazon.com Inc. +2024-04-11,196.2,197.19,193.12,194.52,196.23,69908425.0,Amazon.com Inc. +2024-04-10,194.2,197.16,192.29,196.2,197.11,57583354.0,Amazon.com Inc. +2024-04-09,188.97,194.26,188.04,194.2,192.74,91259524.0,Amazon.com Inc. +2024-04-08,191.56,192.62,187.58,188.97,190.72,96982977.0,Amazon.com Inc. +2024-04-05,191.08,192.08,190.2,191.56,192.83,81738453.0,Amazon.com Inc. +2024-04-04,192.55,193.49,190.89,191.08,192.72,56097091.0,Amazon.com Inc. +2024-04-03,188.31,192.91,187.28,192.55,191.97,59803005.0,Amazon.com Inc. +2024-04-02,183.32,188.98,182.78,188.31,187.98,60750240.0,Amazon.com Inc. +2024-04-01,180.0,184.63,178.48,183.32,183.22,91108406.0,Amazon.com Inc. diff --git a/example/scalalib/spark/6-spark-streaming/resources/data/GOOGL_stock_data.csv b/example/scalalib/spark/6-spark-streaming/resources/data/GOOGL_stock_data.csv new file mode 100644 index 00000000000..9403b1821e6 --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/resources/data/GOOGL_stock_data.csv @@ -0,0 +1,51 @@ +Date,Open,High,Low,Close,Adjusted Close,Volume,Name +2024-06-07,164.22,164.94,162.47,162.68,163.52,87747492.0,Alphabet Inc. +2024-06-06,165.71,165.94,161.46,164.22,165.73,92942594.0,Alphabet Inc. +2024-06-05,166.73,167.63,164.38,165.71,164.34,98068795.0,Alphabet Inc. +2024-06-04,163.62,168.43,162.91,166.73,166.8,98356322.0,Alphabet Inc. +2024-06-03,164.35,164.41,162.7,163.62,164.24,72858903.0,Alphabet Inc. +2024-05-31,167.41,167.42,163.44,164.35,165.3,60822814.0,Alphabet Inc. +2024-05-30,169.4,170.02,166.77,167.41,168.96,90984077.0,Alphabet Inc. +2024-05-29,167.91,170.26,167.48,169.4,170.44,74728848.0,Alphabet Inc. +2024-05-28,169.19,170.17,167.59,167.91,167.28,66843366.0,Alphabet Inc. +2024-05-27,164.96,169.33,164.4,169.19,167.85,53395049.0,Alphabet Inc. +2024-05-24,168.05,169.93,162.38,164.96,163.33,79246906.0,Alphabet Inc. +2024-05-23,166.49,168.86,166.42,168.05,169.62,78348817.0,Alphabet Inc. +2024-05-22,162.28,166.64,162.28,166.49,164.83,58670178.0,Alphabet Inc. +2024-05-21,161.63,164.85,160.87,162.28,160.91,53433382.0,Alphabet Inc. +2024-05-20,161.1,162.25,160.97,161.63,160.26,84392435.0,Alphabet Inc. +2024-05-17,156.99,161.87,155.65,161.1,160.97,61457501.0,Alphabet Inc. +2024-05-16,153.44,159.17,152.9,156.99,157.3,95911262.0,Alphabet Inc. +2024-05-15,153.33,154.66,152.38,153.44,154.07,73468625.0,Alphabet Inc. +2024-05-14,155.55,156.5,152.7,153.33,153.06,79061846.0,Alphabet Inc. 
+2024-05-13,153.98,155.61,153.68,155.55,155.09,59913841.0,Alphabet Inc. +2024-05-10,151.05,154.9,150.83,153.98,153.5,75370439.0,Alphabet Inc. +2024-05-09,152.54,153.36,150.01,151.05,150.97,60828463.0,Alphabet Inc. +2024-05-08,149.95,152.76,148.75,152.54,152.91,71900146.0,Alphabet Inc. +2024-05-07,150.11,150.68,149.34,149.95,148.92,52035848.0,Alphabet Inc. +2024-05-06,148.57,150.35,146.94,150.11,151.48,92607220.0,Alphabet Inc. +2024-05-03,149.86,150.22,147.77,148.57,149.59,97878478.0,Alphabet Inc. +2024-05-02,150.53,154.09,149.77,149.86,150.88,61579149.0,Alphabet Inc. +2024-05-01,151.04,151.6,148.82,150.53,149.51,51524102.0,Alphabet Inc. +2024-04-30,147.8,152.99,146.46,151.04,151.16,89678201.0,Alphabet Inc. +2024-04-29,149.4,151.83,147.74,147.8,147.12,69156053.0,Alphabet Inc. +2024-04-26,151.17,152.17,149.07,149.4,149.35,73387092.0,Alphabet Inc. +2024-04-25,152.42,153.46,151.15,151.17,150.07,88360857.0,Alphabet Inc. +2024-04-24,154.57,155.73,151.45,152.42,153.81,87367702.0,Alphabet Inc. +2024-04-23,150.72,154.87,150.55,154.57,154.25,64488262.0,Alphabet Inc. +2024-04-22,147.47,152.18,147.0,150.72,152.07,93545615.0,Alphabet Inc. +2024-04-19,149.68,151.12,147.1,147.47,147.8,60523401.0,Alphabet Inc. +2024-04-18,152.14,154.26,148.43,149.68,150.35,85119150.0,Alphabet Inc. +2024-04-17,154.7,155.15,151.51,152.14,153.4,78172140.0,Alphabet Inc. +2024-04-16,154.9,155.23,153.68,154.7,154.5,70275688.0,Alphabet Inc. +2024-04-15,152.07,156.25,151.6,154.9,155.94,64143385.0,Alphabet Inc. +2024-04-12,150.19,153.69,150.09,152.07,150.76,74374842.0,Alphabet Inc. +2024-04-11,153.07,153.94,149.86,150.19,151.0,60563214.0,Alphabet Inc. +2024-04-10,150.14,153.98,149.64,153.07,151.82,75941259.0,Alphabet Inc. +2024-04-09,146.35,150.72,145.34,150.14,149.2,71689432.0,Alphabet Inc. +2024-04-08,148.56,149.69,145.6,146.35,147.69,73388869.0,Alphabet Inc. +2024-04-05,147.14,148.67,146.91,148.56,148.12,70515189.0,Alphabet Inc. +2024-04-04,144.78,148.77,143.82,147.14,146.91,72992356.0,Alphabet Inc. +2024-04-03,145.83,148.11,143.41,144.78,145.56,60880726.0,Alphabet Inc. +2024-04-02,148.04,148.06,145.36,145.83,145.55,88808656.0,Alphabet Inc. +2024-04-01,150.0,150.45,147.35,148.04,149.04,91601684.0,Alphabet Inc. diff --git a/example/scalalib/spark/6-spark-streaming/resources/data/MSFT_stock_data.csv b/example/scalalib/spark/6-spark-streaming/resources/data/MSFT_stock_data.csv new file mode 100644 index 00000000000..5debd011025 --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/resources/data/MSFT_stock_data.csv @@ -0,0 +1,51 @@ +Date,Open,High,Low,Close,Adjusted Close,Volume,Name +2024-06-07,362.21,368.54,362.0,367.91,371.49,80170134.0,Microsoft Corp. +2024-06-06,359.53,363.03,358.81,362.21,365.65,86750772.0,Microsoft Corp. +2024-06-05,362.24,362.55,358.12,359.53,356.49,79702914.0,Microsoft Corp. +2024-06-04,366.71,367.61,360.79,362.24,362.85,63298372.0,Microsoft Corp. +2024-06-03,370.91,371.61,366.48,366.71,365.63,51722320.0,Microsoft Corp. +2024-05-31,370.75,371.27,370.28,370.91,370.64,68100699.0,Microsoft Corp. +2024-05-30,365.34,371.6,364.83,370.75,368.43,72966789.0,Microsoft Corp. +2024-05-29,357.55,365.93,357.3,365.34,363.71,90804335.0,Microsoft Corp. +2024-05-28,347.76,358.04,347.52,357.55,358.41,83423744.0,Microsoft Corp. +2024-05-27,354.4,354.61,346.94,347.76,347.45,88019582.0,Microsoft Corp. +2024-05-24,360.19,360.82,353.84,354.4,357.17,86805730.0,Microsoft Corp. +2024-05-23,359.49,360.47,359.08,360.19,363.44,73930984.0,Microsoft Corp. +2024-05-22,361.83,362.42,359.17,359.49,357.92,54345760.0,Microsoft Corp. 
+2024-05-21,354.82,361.98,353.99,361.83,358.54,93824588.0,Microsoft Corp. +2024-05-20,356.19,358.14,354.67,354.82,357.66,78267184.0,Microsoft Corp. +2024-05-17,354.13,358.32,353.16,356.19,356.79,54463418.0,Microsoft Corp. +2024-05-16,344.22,355.37,343.54,354.13,355.03,55001045.0,Microsoft Corp. +2024-05-15,338.7,345.81,338.62,344.22,343.91,60155224.0,Microsoft Corp. +2024-05-14,334.72,339.36,333.83,338.7,340.02,75301054.0,Microsoft Corp. +2024-05-13,332.39,334.84,332.37,334.72,335.59,51635156.0,Microsoft Corp. +2024-05-10,330.22,332.77,329.59,332.39,332.54,57468810.0,Microsoft Corp. +2024-05-09,336.07,336.14,329.59,330.22,333.32,92055856.0,Microsoft Corp. +2024-05-08,341.44,342.04,335.28,336.07,336.43,53912942.0,Microsoft Corp. +2024-05-07,337.68,341.64,336.52,341.44,338.53,62460255.0,Microsoft Corp. +2024-05-06,340.82,341.28,336.46,337.68,341.01,99907857.0,Microsoft Corp. +2024-05-03,334.63,341.8,333.62,340.82,341.01,56285088.0,Microsoft Corp. +2024-05-02,332.59,334.64,330.5,334.63,337.25,71496823.0,Microsoft Corp. +2024-05-01,337.2,338.36,331.78,332.59,329.56,96136438.0,Microsoft Corp. +2024-04-30,337.87,339.3,336.6,337.2,340.36,86869124.0,Microsoft Corp. +2024-04-29,330.88,339.31,330.78,337.87,334.57,70241658.0,Microsoft Corp. +2024-04-26,336.99,337.12,330.75,330.88,333.94,78728561.0,Microsoft Corp. +2024-04-25,343.33,343.77,336.66,336.99,335.12,57136029.0,Microsoft Corp. +2024-04-24,340.09,344.76,339.78,343.33,345.86,96615351.0,Microsoft Corp. +2024-04-23,342.03,343.66,339.4,340.09,338.63,69857417.0,Microsoft Corp. +2024-04-22,339.6,343.27,339.58,342.03,341.17,67352494.0,Microsoft Corp. +2024-04-19,338.49,339.86,337.19,339.6,342.74,63046468.0,Microsoft Corp. +2024-04-18,344.77,345.79,337.68,338.49,339.06,79933823.0,Microsoft Corp. +2024-04-17,341.72,348.01,340.9,344.77,346.47,94620658.0,Microsoft Corp. +2024-04-16,347.03,347.09,340.62,341.72,342.74,63021573.0,Microsoft Corp. +2024-04-15,347.46,348.38,346.82,347.03,344.38,93109628.0,Microsoft Corp. +2024-04-12,348.54,349.31,347.26,347.46,344.81,70088365.0,Microsoft Corp. +2024-04-11,340.2,348.98,339.66,348.54,345.45,85154828.0,Microsoft Corp. +2024-04-10,333.6,340.91,333.01,340.2,339.99,77345820.0,Microsoft Corp. +2024-04-09,328.64,334.87,327.36,333.6,330.34,72903535.0,Microsoft Corp. +2024-04-08,324.58,329.12,324.47,328.64,328.59,88791628.0,Microsoft Corp. +2024-04-05,318.24,326.1,318.22,324.58,322.06,55562067.0,Microsoft Corp. +2024-04-04,316.64,319.28,316.03,318.24,320.83,71288948.0,Microsoft Corp. +2024-04-03,314.88,318.76,314.14,316.64,316.16,70814992.0,Microsoft Corp. +2024-04-02,307.01,315.8,306.45,314.88,315.25,54578238.0,Microsoft Corp. +2024-04-01,300.0,308.27,299.17,307.01,308.71,95791733.0,Microsoft Corp. diff --git a/example/scalalib/spark/6-spark-streaming/resources/data/TSLA_stock_data.csv b/example/scalalib/spark/6-spark-streaming/resources/data/TSLA_stock_data.csv new file mode 100644 index 00000000000..aa3bd0e2cd1 --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/resources/data/TSLA_stock_data.csv @@ -0,0 +1,51 @@ +Date,Open,High,Low,Close,Adjusted Close,Volume,Name +2024-06-07,198.46,201.41,197.53,200.46,199.3,76312911.0,Tesla Inc. +2024-06-06,193.77,198.93,192.61,198.46,198.39,68003027.0,Tesla Inc. +2024-06-05,188.29,194.25,187.72,193.77,191.91,69628490.0,Tesla Inc. +2024-06-04,186.53,188.93,184.68,188.29,188.56,58546310.0,Tesla Inc. +2024-06-03,187.86,187.92,186.18,186.53,185.05,50100734.0,Tesla Inc. +2024-05-31,184.19,188.96,183.71,187.86,186.53,59986298.0,Tesla Inc. 
+2024-05-30,179.21,184.83,178.63,184.19,185.11,62382318.0,Tesla Inc. +2024-05-29,181.71,182.12,178.74,179.21,180.01,98641341.0,Tesla Inc. +2024-05-28,181.78,183.8,180.83,181.71,183.19,59331795.0,Tesla Inc. +2024-05-27,184.46,185.01,181.24,181.78,183.43,70843837.0,Tesla Inc. +2024-05-24,187.8,188.66,183.3,184.46,185.41,51308507.0,Tesla Inc. +2024-05-23,183.12,187.95,183.04,187.8,188.39,97081137.0,Tesla Inc. +2024-05-22,184.25,185.29,181.9,183.12,183.94,92307399.0,Tesla Inc. +2024-05-21,182.53,184.3,182.42,184.25,186.0,99796359.0,Tesla Inc. +2024-05-20,179.69,182.57,179.24,182.53,182.15,75492849.0,Tesla Inc. +2024-05-17,175.72,180.24,174.58,179.69,180.34,58421928.0,Tesla Inc. +2024-05-16,176.63,176.73,174.33,175.72,175.59,93148703.0,Tesla Inc. +2024-05-15,176.03,176.86,174.33,176.63,175.39,65329502.0,Tesla Inc. +2024-05-14,179.59,180.57,175.29,176.03,177.74,71888358.0,Tesla Inc. +2024-05-13,176.66,180.5,174.64,179.59,179.35,81050824.0,Tesla Inc. +2024-05-10,175.58,176.86,175.54,176.66,176.98,54409352.0,Tesla Inc. +2024-05-09,176.14,176.65,174.89,175.58,174.97,51443717.0,Tesla Inc. +2024-05-08,178.9,178.93,175.17,176.14,176.68,74035962.0,Tesla Inc. +2024-05-07,178.92,179.52,178.15,178.9,177.69,64391234.0,Tesla Inc. +2024-05-06,180.07,180.79,178.29,178.92,177.94,99092554.0,Tesla Inc. +2024-05-03,178.46,181.57,177.94,180.07,180.28,96462256.0,Tesla Inc. +2024-05-02,177.75,179.59,177.07,178.46,177.56,98243574.0,Tesla Inc. +2024-05-01,181.03,181.16,176.98,177.75,178.13,92664041.0,Tesla Inc. +2024-04-30,183.89,184.58,180.63,181.03,181.39,88400142.0,Tesla Inc. +2024-04-29,187.14,188.52,182.71,183.89,182.06,65941617.0,Tesla Inc. +2024-04-26,190.87,191.82,186.91,187.14,187.0,74455289.0,Tesla Inc. +2024-04-25,194.07,194.16,189.69,190.87,189.67,58439469.0,Tesla Inc. +2024-04-24,189.04,194.08,188.85,194.07,195.89,71650278.0,Tesla Inc. +2024-04-23,189.19,189.36,188.82,189.04,190.6,87436581.0,Tesla Inc. +2024-04-22,191.24,192.95,189.04,189.19,188.41,83138881.0,Tesla Inc. +2024-04-19,195.14,195.25,190.25,191.24,192.38,91208934.0,Tesla Inc. +2024-04-18,189.55,197.6,188.95,195.14,193.26,77050901.0,Tesla Inc. +2024-04-17,188.96,190.18,188.71,189.55,189.66,51947763.0,Tesla Inc. +2024-04-16,188.8,190.46,188.18,188.96,188.79,58388512.0,Tesla Inc. +2024-04-15,184.4,188.89,183.62,188.8,187.49,68827165.0,Tesla Inc. +2024-04-12,184.16,185.6,183.81,184.4,182.9,79536204.0,Tesla Inc. +2024-04-11,183.06,184.67,181.49,184.16,183.06,82356072.0,Tesla Inc. +2024-04-10,180.02,183.16,179.38,183.06,181.81,91839392.0,Tesla Inc. +2024-04-09,177.69,180.43,177.15,180.02,180.17,62851424.0,Tesla Inc. +2024-04-08,176.47,178.19,175.91,177.69,177.59,58962049.0,Tesla Inc. +2024-04-05,172.98,177.09,172.93,176.47,176.69,92590048.0,Tesla Inc. +2024-04-04,171.5,173.31,171.34,172.98,173.72,66052465.0,Tesla Inc. +2024-04-03,171.46,172.63,171.26,171.5,172.76,91965471.0,Tesla Inc. +2024-04-02,171.11,171.51,170.48,171.46,170.43,93721639.0,Tesla Inc. +2024-04-01,170.0,171.2,169.68,171.11,171.95,59734816.0,Tesla Inc. 
diff --git a/example/scalalib/spark/6-spark-streaming/src/foo/Foo.scala b/example/scalalib/spark/6-spark-streaming/src/foo/Foo.scala new file mode 100644 index 00000000000..ad153947635 --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/src/foo/Foo.scala @@ -0,0 +1,102 @@ +package foo + +import org.apache.spark.sql.{SparkSession, DataFrame} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.Column +import org.apache.spark.sql.types._ +import org.apache.spark.sql.streaming.Trigger + +object Foo { + // Define schema + val schema: StructType = StructType(List( + StructField("Date", StringType, true), + StructField("Open", DoubleType, true), + StructField("High", DoubleType, true), + StructField("Low", DoubleType, true), + StructField("Close", DoubleType, true), + StructField("Adjusted Close", DoubleType, true), + StructField("Volume", DoubleType, true) + )) + + // Function to extract file name + def getFileName: Column = { + val file_name = reverse(split(input_file_name(), "/")).getItem(0) + split(file_name, "_").getItem(0) + } + + // Function to create streaming DataFrame + def createStreamingDF(spark: SparkSession): DataFrame = { + spark.readStream + .format("csv") + .option("maxFilesPerTrigger", 2) + .option("header", true) + .schema(schema) + .load(spark.getClass.getResource("/data").getPath) + .withColumn("Name", getFileName) + } + + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("Spark Streaming") + .master("local[*]") + .getOrCreate() + + spark.sparkContext.setLogLevel("ERROR") + + // Check if DataFrame is streaming + println("Is this Streaming DataFrame : " + createStreamingDF(spark).isStreaming) + + // Print Schema + println("Schema of DataFrame") + createStreamingDF(spark).printSchema() + + // Append Mode + val appendDF = createStreamingDF(spark) + val appendQuery = appendDF.writeStream + .outputMode("append") + .format("console") + .option("truncate", false) + .option("numRows", 3) + .trigger(Trigger.Once()) + .start() + appendQuery.awaitTermination() + + // Update Mode + val updateDF = createStreamingDF(spark) + updateDF.createOrReplaceTempView("updateView") + val updateResult = spark.sql( + """SELECT Year(Date) AS Year, Name, MAX(High) AS Max + |FROM updateView + |GROUP BY Name, Year""".stripMargin + ) + + val updateQuery = updateResult.writeStream + .outputMode("update") + .format("console") + .option("truncate", false) + .option("numRows", 3) + .trigger(Trigger.Once()) + .start() + updateQuery.awaitTermination() + + // Complete Mode + val completeDF = createStreamingDF(spark) + completeDF.createOrReplaceTempView("completeView") + val completeResult = spark.sql( + """SELECT Year(Date) AS Year, Name, MAX(High) AS Max + |FROM completeView + |GROUP BY Name, Year""".stripMargin + ) + + val completeQuery = completeResult.writeStream + .outputMode("complete") + .format("console") + .option("truncate", false) + .option("numRows", 3) + .trigger(Trigger.Once()) + .start() + completeQuery.awaitTermination() + + spark.stop() + } +} diff --git a/example/scalalib/spark/6-spark-streaming/test/src/FooTests.scala b/example/scalalib/spark/6-spark-streaming/test/src/FooTests.scala new file mode 100644 index 00000000000..3875da746ad --- /dev/null +++ b/example/scalalib/spark/6-spark-streaming/test/src/FooTests.scala @@ -0,0 +1,96 @@ +package foo + +import utest._ +import org.apache.spark.sql.{SparkSession, DataFrame} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import 
org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.Encoders + +object FooTests extends TestSuite { + val spark: SparkSession = SparkSession.builder() + .appName("Spark Streaming Tests") + .master("local[*]") + .getOrCreate() + + import spark.implicits._ // Import Spark implicits for Dataset operations + + spark.sparkContext.setLogLevel("ERROR") + + override def utestAfterAll(): Unit = { + spark.stop() + } + + val tests = Tests { + test("createStreamingDF should return a streaming DataFrame") { + val df = Foo.createStreamingDF(spark) + assert(df.isStreaming) + } + + test("getFileName should extract the correct file name") { + val testPath = getClass.getResource("/data/GOOGL_stock_data.csv").getPath + val expected = "GOOGL" + val result = Foo.getFileName + val df = spark.read.schema(Foo.schema).csv(testPath).withColumn("Name", result) + val actual = df.select("Name").as[String].head() + assert(actual == expected) + } + + test("append mode should output raw data") { + val df = Foo.createStreamingDF(spark) + val query = df.writeStream + .outputMode("append") + .format("memory") + .queryName("appendOutput") + .trigger(Trigger.Once()) + .start() + query.awaitTermination() + + val result = spark.sql("SELECT * FROM appendOutput") + assert(result.count() > 0) + } + + test("update mode should output aggregated data") { + val df = Foo.createStreamingDF(spark) + df.createOrReplaceTempView("updateView") + val aggregated = spark.sql( + """SELECT Year(Date) AS Year, Name, MAX(High) AS Max + |FROM updateView + |GROUP BY Name, Year""".stripMargin + ) + + val query = aggregated.writeStream + .outputMode("update") + .format("memory") + .queryName("updateOutput") + .trigger(Trigger.Once()) + .start() + query.awaitTermination() + + val result = spark.sql("SELECT * FROM updateOutput") + assert(result.count() > 0) + } + + test("complete mode should output aggregated data") { + val df = Foo.createStreamingDF(spark) + df.createOrReplaceTempView("completeView") + val aggregated = spark.sql( + """SELECT Year(Date) AS Year, Name, MAX(High) AS Max + |FROM completeView + |GROUP BY Name, Year""".stripMargin + ) + + val query = aggregated.writeStream + .outputMode("complete") + .format("memory") + .queryName("completeOutput") + .trigger(Trigger.Once()) + .start() + query.awaitTermination() + + val result = spark.sql("SELECT * FROM completeOutput") + assert(result.count() > 0) + } + } +} diff --git a/example/scalalib/spark/7-spark-mllib/build.mill b/example/scalalib/spark/7-spark-mllib/build.mill new file mode 100644 index 00000000000..9918ed098fb --- /dev/null +++ b/example/scalalib/spark/7-spark-mllib/build.mill @@ -0,0 +1,57 @@ +package build +import mill._, scalalib._ + +object `package` extends RootModule with ScalaModule { + def scalaVersion = "2.12.15" + def ivyDeps = Seq( + ivy"org.apache.spark::spark-core:3.5.4", + ivy"org.apache.spark::spark-sql:3.5.4", + ivy"org.apache.spark:spark-mllib_2.12:3.5.4" + ) + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + + def prependShellScript = "" + + object test extends ScalaTests { + def ivyDeps = Seq(ivy"com.lihaoyi::utest:0.8.5") + def testFramework = "utest.runner.Framework" + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + 
"--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + } + +} + +/** Usage + +> ./mill run +... +VALIDATION: Starting Spark MLlib Pipeline +VALIDATION: Spark session initialized successfully +VALIDATION: Loading data from path: .../resources/data.csv... +VALIDATION: Training new model +VALIDATION: Model accuracy = ... +VALIDATION: Model AUC = ... +VALIDATION: Model persistence validated at ./out/dt_model +VALIDATION: Spark session closed +... + +> ./mill test +...foo.FooTests.dataLoading... +...foo.FooTests.pipelineConstruction... +...foo.FooTests.modelTraining... +...foo.FooTests.modelPersistence... +...foo.FooTests.evaluationMetrics... +...Tests: 5, Passed: 5, Failed: 0... + +*/ diff --git a/example/scalalib/spark/7-spark-mllib/resources/data.csv b/example/scalalib/spark/7-spark-mllib/resources/data.csv new file mode 100644 index 00000000000..c3ec3063742 --- /dev/null +++ b/example/scalalib/spark/7-spark-mllib/resources/data.csv @@ -0,0 +1,501 @@ +color,price,label +red,14.7,0 +green,48.01,1 +red,25.46,0 +red,26.38,0 +red,23.18,0 +blue,16.77,0 +green,41.77,1 +red,42.72,1 +yellow,34.91,0 +red,38.78,1 +red,24.26,0 +yellow,15.21,0 +yellow,47.12,0 +green,43.53,1 +red,28.91,0 +red,49.59,1 +red,47.82,1 +green,49.39,1 +blue,42.31,0 +red,27.45,0 +green,12.03,0 +blue,36.42,0 +green,22.66,0 +blue,37.93,0 +red,49.33,1 +yellow,43.24,0 +blue,20.07,0 +blue,35.75,0 +yellow,14.18,0 +yellow,17.73,0 +blue,38.28,0 +green,45.91,1 +green,23.5,0 +red,33.55,1 +green,43.74,1 +blue,18.21,0 +red,46.67,1 +yellow,38.48,0 +yellow,23.4,0 +green,39.65,1 +red,32.29,1 +yellow,45.74,0 +red,40.95,1 +red,42.1,1 +red,42.81,1 +yellow,19.66,0 +green,22.42,0 +red,36.62,1 +green,38.04,1 +blue,11.88,0 +red,46.88,1 +green,14.79,0 +yellow,15.51,0 +red,17.82,0 +blue,33.59,0 +yellow,46.02,0 +blue,38.22,0 +red,45.76,1 +blue,12.31,0 +red,42.47,1 +yellow,18.73,0 +blue,12.51,0 +yellow,49.92,0 +blue,45.41,0 +red,44.16,1 +green,18.62,0 +yellow,49.11,0 +green,24.56,0 +red,39.08,1 +red,32.05,1 +yellow,29.76,0 +yellow,43.72,0 +green,31.52,1 +green,36.94,1 +yellow,18.86,0 +yellow,34.68,0 +blue,44.91,0 +blue,46.77,0 +yellow,16.48,0 +red,13.26,0 +yellow,20.7,0 +yellow,28.28,0 +yellow,32.36,0 +green,27.42,0 +blue,48.75,0 +red,46.11,1 +red,36.59,1 +blue,22.91,0 +green,10.36,0 +red,22.69,0 +yellow,23.48,0 +yellow,44.7,0 +green,23.99,0 +yellow,25.36,0 +blue,44.95,0 +yellow,23.23,0 +yellow,45.03,0 +yellow,42.91,0 +red,45.36,1 +yellow,42.25,0 +blue,16.98,0 +green,21.68,0 +yellow,23.01,0 +red,15.98,0 +yellow,30.37,0 +blue,33.17,0 +red,13.05,0 +blue,38.89,0 +yellow,26.49,0 +green,27.82,0 +blue,37.81,0 +yellow,12.45,0 +green,23.98,0 +red,49.49,1 +red,33.46,1 +blue,46.28,0 +blue,39.55,0 +blue,31.08,0 +yellow,15.55,0 +blue,49.24,0 +green,33.01,1 +green,20.16,0 +red,30.55,1 +red,21.71,0 +green,23.04,0 +red,23.23,0 +blue,40.42,0 +yellow,10.03,0 +red,27.42,0 +green,45.38,1 +yellow,15.79,0 +blue,37.44,0 +yellow,19.5,0 +blue,18.61,0 +green,11.5,0 +blue,17.51,0 +green,47.47,1 +blue,41.87,0 +red,13.31,0 +yellow,15.45,0 +yellow,37.73,0 +yellow,36.02,0 +yellow,41.57,0 +yellow,36.03,0 +blue,34.89,0 +green,42.08,1 +red,45.82,1 +red,43.36,1 +green,24.61,0 +blue,15.88,0 +red,40.6,1 +yellow,40.81,0 +yellow,27.55,0 +yellow,29.71,0 +blue,23.19,0 +blue,34.89,0 +green,27.83,0 +red,44.51,1 +yellow,23.93,0 +yellow,41.88,0 +blue,44.59,0 +yellow,10.53,0 +red,13.15,0 +yellow,27.58,0 +red,45.24,1 +green,15.8,0 +green,38.86,1 +blue,38.75,0 +yellow,41.27,0 +yellow,18.99,0 +red,41.32,1 +yellow,46.56,0 +blue,28.92,0 +green,45.76,1 
+red,49.09,1 +blue,17.3,0 +yellow,49.05,0 +red,33.64,1 +yellow,45.53,0 +yellow,25.56,0 +red,20.53,0 +green,32.9,1 +blue,45.64,0 +blue,36.23,0 +green,36.83,1 +green,25.84,0 +green,29.51,0 +yellow,21.19,0 +blue,31.83,0 +yellow,14.03,0 +green,29.47,0 +green,46.97,1 +blue,27.26,0 +green,10.08,0 +blue,39.3,0 +yellow,23.67,0 +red,44.02,1 +green,16.77,0 +red,19.77,0 +blue,27.89,0 +yellow,35.84,0 +blue,20.3,0 +green,13.42,0 +yellow,33.74,0 +blue,34.34,0 +green,26.96,0 +green,26.15,0 +red,33.23,1 +yellow,43.96,0 +green,47.55,1 +yellow,10.46,0 +red,13.32,0 +yellow,48.74,0 +red,47.34,1 +green,28.91,0 +blue,29.0,0 +yellow,21.56,0 +blue,37.27,0 +blue,12.24,0 +yellow,31.18,0 +blue,45.91,0 +green,28.94,0 +red,47.86,1 +blue,39.91,0 +blue,10.09,0 +blue,23.4,0 +blue,34.52,0 +red,42.02,1 +blue,21.41,0 +green,13.44,0 +blue,32.37,0 +yellow,31.06,0 +yellow,30.07,0 +green,14.46,0 +blue,12.19,0 +yellow,23.4,0 +yellow,41.7,0 +blue,23.6,0 +yellow,13.88,0 +green,25.9,0 +red,24.37,0 +green,11.52,0 +green,25.45,0 +red,22.46,0 +blue,34.76,0 +green,11.01,0 +red,44.69,1 +yellow,27.42,0 +blue,20.37,0 +red,30.0,0 +red,15.51,0 +red,42.61,1 +yellow,34.2,0 +yellow,20.73,0 +red,34.63,1 +green,25.87,0 +red,45.53,1 +blue,25.97,0 +yellow,39.89,0 +green,34.54,1 +green,30.96,1 +blue,14.81,0 +red,39.3,1 +blue,20.98,0 +blue,32.54,0 +yellow,45.39,0 +yellow,28.59,0 +red,38.61,1 +yellow,27.26,0 +green,11.44,0 +blue,13.77,0 +green,25.3,0 +blue,23.22,0 +blue,49.25,0 +green,12.38,0 +blue,21.2,0 +blue,28.13,0 +red,34.36,1 +yellow,49.26,0 +green,12.99,0 +yellow,24.35,0 +yellow,45.65,0 +blue,49.28,0 +green,35.45,1 +green,30.03,1 +blue,17.99,0 +yellow,42.04,0 +blue,17.49,0 +green,13.26,0 +yellow,11.23,0 +green,40.44,1 +green,23.22,0 +blue,29.39,0 +blue,30.38,0 +yellow,27.25,0 +blue,30.38,0 +green,38.7,1 +blue,22.97,0 +red,20.79,0 +blue,16.51,0 +yellow,44.13,0 +red,33.42,1 +yellow,25.04,0 +blue,19.47,0 +red,33.31,1 +red,15.47,0 +red,26.5,0 +blue,34.4,0 +yellow,31.0,0 +blue,27.22,0 +red,27.54,0 +green,12.19,0 +red,26.99,0 +yellow,11.84,0 +blue,18.29,0 +green,18.46,0 +blue,38.64,0 +green,32.57,1 +green,21.78,0 +yellow,30.02,0 +red,29.99,0 +yellow,41.31,0 +blue,48.64,0 +green,30.03,1 +green,47.06,1 +yellow,14.16,0 +red,36.06,1 +green,31.06,1 +red,26.62,0 +blue,40.31,0 +blue,29.65,0 +green,24.32,0 +green,10.9,0 +red,43.17,1 +green,35.6,1 +red,10.72,0 +red,12.49,0 +yellow,15.8,0 +yellow,30.51,0 +yellow,11.33,0 +blue,26.28,0 +green,17.75,0 +yellow,11.8,0 +yellow,10.17,0 +red,45.29,1 +yellow,15.39,0 +red,34.81,1 +yellow,17.44,0 +red,34.73,1 +yellow,30.81,0 +blue,25.4,0 +green,25.02,0 +yellow,42.73,0 +green,26.02,0 +green,13.32,0 +green,10.68,0 +green,40.43,1 +blue,28.86,0 +yellow,26.48,0 +green,40.61,1 +blue,48.91,0 +green,13.29,0 +yellow,41.57,0 +yellow,25.94,0 +red,49.93,1 +yellow,10.43,0 +red,15.44,0 +blue,13.1,0 +red,30.45,1 +red,33.88,1 +yellow,32.72,0 +red,24.62,0 +red,32.64,1 +green,49.63,1 +yellow,20.89,0 +blue,16.11,0 +red,34.21,1 +yellow,20.65,0 +red,14.96,0 +red,19.2,0 +yellow,16.65,0 +green,32.24,1 +red,37.86,1 +green,13.4,0 +red,28.81,0 +blue,41.29,0 +yellow,42.65,0 +green,22.6,0 +blue,11.17,0 +red,34.29,1 +red,47.76,1 +blue,38.84,0 +green,14.17,0 +yellow,26.61,0 +red,21.27,0 +red,30.16,1 +green,46.17,1 +yellow,32.81,0 +yellow,11.17,0 +red,37.8,1 +green,12.25,0 +red,22.27,0 +blue,20.81,0 +yellow,17.62,0 +yellow,22.44,0 +yellow,11.58,0 +blue,19.33,0 +green,39.0,1 +red,40.32,1 +green,11.62,0 +red,46.85,1 +blue,24.22,0 +green,30.01,1 +blue,47.95,0 +green,47.66,1 +yellow,23.82,0 +green,28.35,0 +yellow,37.41,0 +yellow,45.75,0 +blue,37.36,0 
+yellow,23.71,0 +blue,23.7,0 +red,15.34,0 +blue,19.7,0 +green,33.09,1 +blue,24.65,0 +green,25.28,0 +yellow,44.32,0 +blue,46.15,0 +blue,17.09,0 +green,38.62,1 +blue,43.13,0 +red,20.46,0 +yellow,24.02,0 +yellow,25.25,0 +blue,39.3,0 +red,40.51,1 +green,43.95,1 +green,24.54,0 +yellow,47.19,0 +red,47.19,1 +yellow,18.4,0 +blue,38.87,0 +red,38.55,1 +blue,40.55,0 +yellow,46.73,0 +green,43.3,1 +yellow,26.33,0 +green,10.9,0 +yellow,40.15,0 +yellow,11.39,0 +green,18.41,0 +yellow,14.07,0 +yellow,22.18,0 +green,19.85,0 +red,25.6,0 +green,48.78,1 +yellow,15.0,0 +yellow,17.78,0 +green,11.14,0 +red,32.15,1 +blue,16.98,0 +green,31.36,1 +blue,37.01,0 +blue,27.02,0 +red,27.95,0 +yellow,15.07,0 +blue,19.28,0 +red,34.31,1 +blue,28.55,0 +red,34.59,1 +blue,39.41,0 +red,19.93,0 +green,39.34,1 +blue,26.58,0 +red,40.07,1 +green,31.79,1 +blue,28.95,0 +red,17.67,0 +green,48.2,1 +yellow,34.7,0 +green,27.52,0 +blue,37.79,0 +green,38.66,1 +yellow,39.47,0 +red,22.83,0 +red,17.48,0 +green,42.84,1 +yellow,14.36,0 +blue,49.75,0 +red,46.56,1 +blue,16.59,0 +yellow,47.1,0 +green,37.17,1 +green,49.85,1 +green,20.01,0 +red,21.78,0 +green,11.18,0 +green,25.68,0 +blue,15.46,0 diff --git a/example/scalalib/spark/7-spark-mllib/src/foo/Foo.scala b/example/scalalib/spark/7-spark-mllib/src/foo/Foo.scala new file mode 100644 index 00000000000..ae38fb890d7 --- /dev/null +++ b/example/scalalib/spark/7-spark-mllib/src/foo/Foo.scala @@ -0,0 +1,99 @@ +package foo + +import org.apache.spark.sql.{SparkSession, DataFrame} +import org.apache.spark.sql.functions.col +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.classification.DecisionTreeClassifier +import org.apache.spark.ml.evaluation.{ + BinaryClassificationEvaluator, + MulticlassClassificationEvaluator +} + +object Foo { + val ModelPath = "./out/dt_model" + val DataPath = "/data.csv" + + def main(args: Array[String]): Unit = { + println("VALIDATION: Starting Spark MLlib Pipeline") + + val spark = SparkSession.builder() + .appName("Spark MLlib") + .master("local[*]") + .getOrCreate() + + try { + runPipeline(spark) + } finally { + spark.stop() + println("VALIDATION: Spark session closed") + } + } + + def runPipeline(spark: SparkSession): Unit = { + println("VALIDATION: Spark session initialized successfully") + + val data = loadData(spark) + val model = trainOrLoadModel(data) + val Array(_, testData) = data.randomSplit(Array(0.7, 0.3)) + val predictions = model.transform(testData) + + evaluateModel(predictions) + validateModelPersistency(model) + } + + def loadData(spark: SparkSession): DataFrame = { + val dataPath = getClass.getResource(DataPath).getPath + println(s"VALIDATION: Loading data from path: $dataPath") + + spark.read + .option("header", "true") + .csv(dataPath) + .select( + col("color"), + col("price").cast("double"), + col("label").cast("double") + ) + } + + def trainOrLoadModel(data: DataFrame): PipelineModel = { + try { + PipelineModel.load(ModelPath) + } catch { + case _: Exception => + println("VALIDATION: Training new model") + val model = createPipeline().fit(data) + model.write.overwrite().save(ModelPath) + model + } + } + + def createPipeline(): Pipeline = { + val stages = Array( + new StringIndexer().setInputCol("color").setOutputCol("colorIndex"), + new OneHotEncoder().setInputCols(Array("colorIndex")).setOutputCols(Array("colorVec")), + new VectorAssembler().setInputCols(Array("colorVec", "price")).setOutputCol("features"), + new 
DecisionTreeClassifier().setLabelCol("label").setFeaturesCol("features").setMaxDepth(5)
+    )
+    new Pipeline().setStages(stages)
+  }
+
+  def evaluateModel(predictions: DataFrame): Unit = {
+    val accuracy = new MulticlassClassificationEvaluator()
+      .setMetricName("accuracy")
+      .evaluate(predictions)
+
+    val auc = new BinaryClassificationEvaluator()
+      .setMetricName("areaUnderROC")
+      .evaluate(predictions)
+
+    println(s"VALIDATION: Model accuracy = $accuracy")
+    println(s"VALIDATION: Model AUC = $auc")
+  }
+
+  def validateModelPersistency(model: PipelineModel): Unit = {
+    model.write.overwrite().save(ModelPath)
+    require(PipelineModel.load(ModelPath).stages.length == model.stages.length, "reloaded model does not match the saved one")
+    println(s"VALIDATION: Model persistence validated at $ModelPath")
+  }
+}
diff --git a/example/scalalib/spark/7-spark-mllib/test/src/FooTests.scala b/example/scalalib/spark/7-spark-mllib/test/src/FooTests.scala
new file mode 100644
index 00000000000..bd85a60cc20
--- /dev/null
+++ b/example/scalalib/spark/7-spark-mllib/test/src/FooTests.scala
@@ -0,0 +1,65 @@
+package foo
+
+import utest._
+import org.apache.spark.sql.{SparkSession, DataFrame}
+import org.apache.spark.ml.PipelineModel
+import org.apache.spark.ml.classification.DecisionTreeClassifier
+import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
+
+object FooTests extends TestSuite {
+  val spark = SparkSession.builder()
+    .master("local[*]")
+    .appName("Spark MLlib Tests")
+    .getOrCreate()
+
+  val tests = Tests {
+    test("dataLoading") {
+      val data = Foo.loadData(spark)
+      assert(data.count() > 0)
+      assert(data.columns.sorted.toSeq == Seq("color", "label", "price").sorted)
+    }
+
+    test("pipelineConstruction") {
+      val pipeline = Foo.createPipeline()
+      assert(pipeline.getStages.length == 4)
+      assert(pipeline.getStages.last.isInstanceOf[DecisionTreeClassifier])
+    }
+
+    test("modelTraining") {
+      val data = Foo.loadData(spark)
+      val model = Foo.trainOrLoadModel(data)
+      assert(model.stages.length == 4)
+      assert(model.transform(data.limit(10)).columns.contains("prediction"))
+    }
+
+    test("modelPersistence") {
+      val model = PipelineModel.load(Foo.ModelPath)
+      val testData = spark.createDataFrame(Seq(
+        ("red", 25.0, 0.0),
+        ("green", 35.0, 1.0)
+      )).toDF("color", "price", "label")
+
+      val predictions = model.transform(testData)
+      assert(predictions.select("prediction").count() == 2)
+    }
+
+    test("evaluationMetrics") {
+      val data = Foo.loadData(spark)
+      val model = PipelineModel.load(Foo.ModelPath)
+      val predictions = model.transform(data.limit(10))
+
+      val accuracy = evaluateModel(predictions)
+      assert(accuracy >= 0.0 && accuracy <= 1.0)
+    }
+  }
+
+  override def utestAfterAll(): Unit = {
+    spark.stop()
+  }
+
+  private def evaluateModel(predictions: DataFrame): Double = {
+    new MulticlassClassificationEvaluator()
+      .setMetricName("accuracy")
+      .evaluate(predictions)
+  }
+}
diff --git a/example/scalalib/spark/8-spark-graphx/build.mill b/example/scalalib/spark/8-spark-graphx/build.mill
new file mode 100644
index 00000000000..d1fd4b726ac
--- /dev/null
+++ b/example/scalalib/spark/8-spark-graphx/build.mill
@@ -0,0 +1,82 @@
+package build
+import mill._, scalalib._
+
+object `package` extends RootModule with ScalaModule {
+  def scalaVersion = "2.12.15"
+  def ivyDeps = Seq(
+    ivy"org.apache.spark::spark-core:3.5.4",
+    ivy"org.apache.spark::spark-sql:3.5.4",
+    ivy"org.apache.spark::spark-graphx:3.5.4"
+  )
+
+  def forkArgs = Seq(
+    "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
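+    // Spark 3.x still reaches into JDK-internal APIs (java.nio buffers,
+    // sun.nio.ch channels, java.util internals) via reflection; the Java
+    // module system blocks that by default on Java 17+, so the forked JVM
+    // opens these packages to avoid InaccessibleObjectException at startup.
+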
"--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + + def prependShellScript = "" + + object test extends ScalaTests { + def ivyDeps = Seq(ivy"com.lihaoyi::utest:0.8.5") + def testFramework = "utest.runner.Framework" + + def forkArgs = Seq( + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" + ) + } + +} + +/** Usage + +> ./mill run +... +Triangle Counting Results: ++--------+-------------+ +| Name|TriangleCount| ++--------+-------------+ +| Contact| 0| +| Blog| 0| +| About| 1| +| Home| 1| +|Products| 1| +| FAQ| 0| ++--------+-------------+ +... +PageRank Results: ++--------+--------+ +| Name|PageRank| ++--------+--------+ +| Contact| 0.9077| +| Blog| 0.9077| +| About| 0.5468| +| Home| 0.9334| +|Products| 1.7828| +| FAQ| 0.9215| ++--------+--------+ +... +Connected Components Results: ++--------+---------+ +| Name|Component| ++--------+---------+ +| Contact| 1| +| Blog| 1| +| About| 1| +| Home| 1| +|Products| 1| +| FAQ| 1| ++--------+---------+ +... + +> ./mill test +...foo.FooTests.loadVertices should load vertices correctly... +...foo.FooTests.loadEdges should load edges correctly... +...foo.FooTests.analyzeGraph should compute triangles, ranks, and components correctly... +...Tests: 3, Passed: 3, Failed: 0... + +*/ diff --git a/example/scalalib/spark/8-spark-graphx/resources/data/edges.txt b/example/scalalib/spark/8-spark-graphx/resources/data/edges.txt new file mode 100644 index 00000000000..9ae7508ad38 --- /dev/null +++ b/example/scalalib/spark/8-spark-graphx/resources/data/edges.txt @@ -0,0 +1,8 @@ +1,2,1 +1,3,1 +2,3,1 +3,4,1 +4,5,1 +5,1,1 +3,6,1 +6,3,1 \ No newline at end of file diff --git a/example/scalalib/spark/8-spark-graphx/resources/data/vertices.txt b/example/scalalib/spark/8-spark-graphx/resources/data/vertices.txt new file mode 100644 index 00000000000..f8cf9d5b41a --- /dev/null +++ b/example/scalalib/spark/8-spark-graphx/resources/data/vertices.txt @@ -0,0 +1,6 @@ +1,Home +2,About +3,Products +4,Contact +5,FAQ +6,Blog \ No newline at end of file diff --git a/example/scalalib/spark/8-spark-graphx/src/foo/Foo.scala b/example/scalalib/spark/8-spark-graphx/src/foo/Foo.scala new file mode 100644 index 00000000000..e55175af82c --- /dev/null +++ b/example/scalalib/spark/8-spark-graphx/src/foo/Foo.scala @@ -0,0 +1,90 @@ +package foo + +import org.apache.spark._ +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +object Foo { + type VertexRDD = RDD[(VertexId, String)] + type EdgeRDD = RDD[Edge[Int]] + + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("Spark GraphX") + .master("local[*]") + .getOrCreate() + val sc = spark.sparkContext + sc.setLogLevel("ERROR") + + // Load data + val vertices = loadVertices(sc, getClass.getResource("/data/vertices.txt").getPath) + val edges = loadEdges(sc, getClass.getResource("/data/edges.txt").getPath) + + // Analyze graph + val (triangles, ranks, components) = analyzeGraph(spark, vertices, edges) + + // Format and print results + printResults(spark, vertices, triangles, ranks, components) + + spark.stop() + } + + def loadVertices(sc: SparkContext, path: String): VertexRDD = { + sc.textFile(path).map { line => + val fields = line.split(",") + (fields(0).toLong, fields(1)) + } + } + + def loadEdges(sc: 
SparkContext, path: String): EdgeRDD = { + sc.textFile(path).map { line => + val fields = line.split(",") + Edge(fields(0).toLong, fields(1).toLong, fields(2).toInt) + } + } + + def analyzeGraph(spark: SparkSession, vertices: VertexRDD, edges: EdgeRDD) = { + import spark.implicits._ + + val graph = Graph(vertices, edges) + .partitionBy(PartitionStrategy.CanonicalRandomVertexCut) + + val triangles = graph.triangleCount().vertices + val ranks = graph.pageRank(0.0001).vertices + val components = graph.connectedComponents().vertices + + (triangles, ranks, components) + } + + def printResults( + spark: SparkSession, + vertices: VertexRDD, + triangles: RDD[(VertexId, Int)], + ranks: RDD[(VertexId, Double)], + components: RDD[(VertexId, VertexId)] + ) = { + import spark.implicits._ + + // Triangle Counting + val triangleDF = vertices.join(triangles) + .map { case (id, (name, count)) => (name, count) } + .toDF("Name", "TriangleCount") + println("Triangle Counting Results:") + triangleDF.show() + + // PageRank + val rankDF = vertices.join(ranks) + .map { case (id, (name, rank)) => (name, f"${rank}%.4f") } + .toDF("Name", "PageRank") + println("PageRank Results:") + rankDF.show() + + // Connected Components + val compDF = vertices.join(components) + .map { case (id, (name, comp)) => (name, comp) } + .toDF("Name", "Component") + println("Connected Components Results:") + compDF.show() + } +} diff --git a/example/scalalib/spark/8-spark-graphx/test/src/FooTests.scala b/example/scalalib/spark/8-spark-graphx/test/src/FooTests.scala new file mode 100644 index 00000000000..37c9fb47386 --- /dev/null +++ b/example/scalalib/spark/8-spark-graphx/test/src/FooTests.scala @@ -0,0 +1,67 @@ +package foo + +import utest._ +import org.apache.spark._ +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +object FooTests extends TestSuite { + val spark: SparkSession = SparkSession.builder() + .appName("FooTests") + .master("local[*]") + .getOrCreate() + val sc: SparkContext = spark.sparkContext + sc.setLogLevel("ERROR") + + val tests = Tests { + test("loadVertices should load vertices correctly") { + val path = getClass.getResource("/data/vertices.txt").getPath + val vertices = Foo.loadVertices(sc, path) + val collected = vertices.collect() + assert(collected.length == 6) // 6 vertices in the test file + } + + test("loadEdges should load edges correctly") { + val path = getClass.getResource("/data/edges.txt").getPath + val edges = Foo.loadEdges(sc, path) + val collected = edges.collect() + assert(collected.length == 8) // 8 edges in the test file + } + + test("analyzeGraph should compute triangles, ranks, and components correctly") { + val verticesPath = getClass.getResource("/data/vertices.txt").getPath + val edgesPath = getClass.getResource("/data/edges.txt").getPath + val vertices = Foo.loadVertices(sc, verticesPath) + val edges = Foo.loadEdges(sc, edgesPath) + + val (triangles, ranks, components) = Foo.analyzeGraph(spark, vertices, edges) + + // Test triangle counts + val triangleCounts = triangles.collect() + assert(triangleCounts.length == 6) // 6 vertices + assert(triangleCounts.contains((1L, 1))) // Home has 1 triangle + assert(triangleCounts.contains((2L, 1))) // About has 1 triangle + assert(triangleCounts.contains((3L, 1))) // Products has 1 triangle + assert(triangleCounts.contains((4L, 0))) // Contact has 0 triangle + assert(triangleCounts.contains((5L, 0))) // FAQ has 0 triangle + assert(triangleCounts.contains((6L, 0))) // Blog has 0 triangle + + 
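// The single triangle is Home-About-Products, formed by edges 1->2,
+      // 1->3 and 2->3 in edges.txt; the reciprocal pair 3->6/6->3 does not
+      // form one. A triangle is counted once at each of its three corners,
+      // so the per-vertex counts must sum to 3:
+      assert(triangleCounts.map(_._2).sum == 3)
+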
// Test PageRank
+      val pageRanks = ranks.collect()
+      assert(pageRanks.length == 6) // 6 vertices
+      assert(pageRanks.forall { case (_, rank) => rank > 0.0 }) // the teleport term keeps every PageRank score positive
+
+      // Test connected components
+      val connectedComponents = components.collect()
+      assert(connectedComponents.length == 6) // 6 vertices
+      assert(connectedComponents.forall { case (_, component) =>
+        component == 1L
+      }) // All in one component, labelled by its smallest vertex id
+    }
+  }
+
+  override def utestAfterAll(): Unit = {
+    spark.stop()
+  }
+}
diff --git a/website/docs/modules/ROOT/pages/scalalib/spark.adoc b/website/docs/modules/ROOT/pages/scalalib/spark.adoc
index 0c8d30e834d..bcc3b79bf32 100644
--- a/website/docs/modules/ROOT/pages/scalalib/spark.adoc
+++ b/website/docs/modules/ROOT/pages/scalalib/spark.adoc
@@ -14,4 +14,24 @@
 include::partial$example/scalalib/spark/1-hello-spark.adoc[]
 
 == Semi realistic spark project with spark submit
-include::partial$example/scalalib/spark/3-semi-realistic.adoc[]
\ No newline at end of file
+include::partial$example/scalalib/spark/3-semi-realistic.adoc[]
+
+== Spark RDD Transformations & Actions
+
+include::partial$example/scalalib/spark/4-rdd-spark.adoc[]
+
+== Spark SQL Analytics
+
+include::partial$example/scalalib/spark/5-sql-analytics.adoc[]
+
+== Spark Streaming
+
+include::partial$example/scalalib/spark/6-spark-streaming.adoc[]
+
+== Spark MLlib
+
+include::partial$example/scalalib/spark/7-spark-mllib.adoc[]
+
+== Spark GraphX
+
+include::partial$example/scalalib/spark/8-spark-graphx.adoc[]
\ No newline at end of file