diff --git a/.gitignore b/.gitignore index 8312df086d..1b78d53126 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ _site *.swp .sass-cache *~ +_posts/2023-03-29-Ksql_and_Kafka_Connect_101.md +assets/blog-images/ksql_connect_blog \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 52ff7c4619..89132d463e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,72 +1,100 @@ GEM remote: https://rubygems.org/ specs: - addressable (2.6.0) - public_suffix (>= 2.0.2, < 4.0) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) colorator (1.1.0) - concurrent-ruby (1.1.4) - em-websocket (0.5.1) + concurrent-ruby (1.3.3) + em-websocket (0.5.3) eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) + http_parser.rb (~> 0) eventmachine (1.2.7) - eventmachine (1.2.7-x64-mingw32) - ffi (1.10.0) - ffi (1.10.0-x64-mingw32) + ffi (1.17.0) forwardable-extended (2.6.0) - http_parser.rb (0.6.0) - i18n (0.9.5) + google-protobuf (3.25.3) + google-protobuf (3.25.3-aarch64-linux) + google-protobuf (3.25.3-arm64-darwin) + google-protobuf (3.25.3-x86-linux) + google-protobuf (3.25.3-x86_64-darwin) + google-protobuf (3.25.3-x86_64-linux) + http_parser.rb (0.8.0) + i18n (1.14.5) concurrent-ruby (~> 1.0) - jekyll (3.8.5) + jekyll (4.3.3) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) - i18n (~> 0.7) - jekyll-sass-converter (~> 1.0) + i18n (~> 1.0) + jekyll-sass-converter (>= 2.0, < 4.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (~> 2.3, >= 2.3.1) + kramdown-parser-gfm (~> 1.0) liquid (~> 4.0) - mercenary (~> 0.3.3) + mercenary (>= 0.3.6, < 0.5) pathutil (~> 0.9) - rouge (>= 1.7, < 4) + rouge (>= 3.0, < 5.0) safe_yaml (~> 1.0) - jekyll-archives (2.1.1) - jekyll (>= 2.4) - jekyll-feed (0.11.0) - jekyll (~> 3.3) + terminal-table (>= 1.8, < 4.0) + webrick (~> 1.7) + jekyll-archives (2.2.1) + jekyll (>= 3.6, < 5.0) + jekyll-feed (0.17.0) + jekyll (>= 3.7, < 5.0) jekyll-paginate (1.1.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-watch (2.1.2) + jekyll-sass-converter (3.0.0) + sass-embedded (~> 1.54) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-watch (2.2.1) listen (~> 3.0) - kramdown (1.17.0) - liquid (4.0.1) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) - mercenary (0.3.6) + kramdown (2.4.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.4) + listen (3.9.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + mercenary (0.4.0) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (3.0.3) - rb-fsevent (0.10.3) - rb-inotify (0.10.0) + public_suffix (5.1.1) + rake (13.2.1) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rouge (3.3.0) - ruby_dep (1.5.0) + rexml (3.3.2) + strscan + rouge (4.3.0) safe_yaml (1.0.5) - sass (3.7.3) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) + sass-embedded (1.63.6) + google-protobuf (~> 3.23) + rake (>= 13.0.0) + strscan (3.1.0) + terminal-table (3.0.2) + unicode-display_width (>= 1.1.1, < 3) + unicode-display_width (2.5.0) + webrick (1.8.1) PLATFORMS + aarch64-linux + aarch64-linux-android + aarch64-linux-musl + arm-linux + arm-linux-androideabi + arm-linux-musl + arm-linux-musleabihf + arm64-darwin ruby - x64-mingw32 + x86-linux + x86-linux-android + x86-linux-musl + x86_64-darwin + x86_64-linux + 
x86_64-linux-android + x86_64-linux-musl DEPENDENCIES jekyll-archives @@ -78,4 +106,4 @@ DEPENDENCIES rouge BUNDLED WITH - 2.0.1 + 2.4.22 diff --git a/README.md b/README.md index da71f9d5f0..c6c44f7fe4 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,6 @@ -# Mediumish - Jekyll Theme +# Platformatory Blog -[Live Demo](https://wowthemesnet.github.io/mediumish-theme-jekyll/)   |   [Download](https://github.com/wowthemesnet/mediumish-theme-jekyll/archive/master.zip)   |   [Documentation](https://bootstrapstarter.com/template-mediumish-bootstrap-jekyll/)   |   [Buy me a coffee](https://www.wowthemes.net/donate/) +# Contributing -![mediumish](assets/images/mediumish-jekyll-template.png) - - -### Copyright - -Copyright (C) 2019 Sal, https://www.wowthemes.net - -**Mediumish for Jekyll** is designed and developed by [Sal](https://www.wowthemes.net) and it is *free* under MIT license. - -Buy Me A Coffee - -### Contribute - -1. [Fork the repo](https://github.com/wowthemesnet/mediumish-theme-jekyll). -2. Clone a copy of your fork on your local -3. Create a branch off of master and give it a meaningful name (e.g. my-new-mediumish-feature). -4. Make necessary changes, commit, push and open a pull request on GitHub. - -Thank you! +- Fork the repo +- Make a PR diff --git a/_config.yml b/_config.yml index 8f192fabcd..daafef0cab 100644 --- a/_config.yml +++ b/_config.yml @@ -1,35 +1,116 @@ # Site -name: "Mediumish" -title: "Mediumish" -description: "Jekyll template, Medium styled, free for bloggers." -logo: 'assets/images/logo.png' +name: "The Write Ahead Log" +title: "The Write Ahead Log" +description: "Platform and Product Engineering Journal" +logo: 'assets/images/plf-logo.svg' favicon: 'assets/images/logo.png' -baseurl: /mediumish-theme-jekyll -google_analytics: 'UA-46278016-1' -disqus: 'demowebsite' -mailchimp-list: 'https://wowthemes.us11.list-manage.com/subscribe/post?u=8aeb20a530e124561927d3bd8&id=8c3d2d214b' +baseurl: '/blog' +google_analytics: 'G-PWW2W71JX1' +#disqus: 'demowebsite' +mailchimp-list: 'https://platformatory.us17.list-manage.com/subscribe/post?u=8cc2e44e80300ae193f198bcc&id=6246b12a61' include: ["_pages"] permalink: /:title/ # Authors authors: - sal: - name: Sal - display_name: Sal - gravatar: e56154546cf4be74e393c62d1ae9f9d4 - email: wowthemesnet@gmail.com - web: https://www.wowthemes.net - twitter: https://twitter.com/wowthemesnet - description: "Author of Mediumish, a Bootstrap Medium styled template available for WordPress, HTML, Ghost and Jekyll. You are currently previewing Jekyll template demo." - john: - name: John - display_name: John + p6: + name: pavan + display_name: Pavan Keshavamurthy + gravatar: a92e3c1140c93b65879c69ecf47bb1b5 + email: pavan@platformatory.io + web: https://platformatory.io + twitter: https://twitter.com/platformatory + description: "Cofounder at Platformatory. 
Pavan has handyman experience around systems programming, data, infrastructure and running tech consulting gigs" + badri: + name: badri + display_name: Lakshminarasimhan Parthasarathy + avatar: 'assets/images/lakshmi.jpg' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: lakshmi@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Principal Engineer at Platformatory; Lakshmi comes with the triad of full stack development, operations and data experience" + dasa: + name: dasa + display_name: Dasa Sathyan + avatar: 'assets/images/avatar.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: srikanth@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Sr Consultant at Platformatory;" + ashwin: + name: ashwin + display_name: Ashwin Venkatesan + avatar: 'assets/images/avatar.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: ashwin@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Platform Engineer at Platformatory specializing in streaming; Ashwin has previously dabbled in data science, machine learning and predicting box office success for Tollywood movies" + venkatesh: + name: venkatesh + display_name: Venkatesh Keshavamurthy + avatar: 'assets/images/avatar.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: srikanth@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Data Engineer at Platformatory;" + avinash: + name: avinash + display_name: Avinash Upadhyaya avatar: 'assets/images/avatar.png' gravatar: b1cc14991db7a456fcd761680bbc8f81 - email: wowthemesnet@gmail.com - web: https://www.wowthemes.net - twitter: https://twitter.com/wowthemesnet - description: "This is the author box. Write a short description of the author here. You are currently previewing Mediumish demo, a Jekyll template compatible with Github pages." + email: avinash@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Platform Engineer at Platformatory specializing in Infrastructure; Avinash is an avid technical evangelist advocating for open source technologies in the local communities through meetups and conferences. He is a Kong Champion and a Confluent Community Catalyst. Apart from consulting, Avinash is an open source contributor with contributions to various projects in the CNCF ecosystem." + raghav: + name: raghav + display_name: Raghavendran Nehru + avatar: 'assets/images/avatar.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: raghav@platformatory.com + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "Director & Practitioner, North America;" + Shivaprakash: + name: Shivaprakash Akki + display_name: Shivaprakash Akki + avatar: 'assets/images/avatar.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: akkishiva@platformatory.com + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "DevOps engineer at Platformatory specializing in Site Reliability Engineering (SRE) and Infrastructure." 
+ Arun: + name: Arun B Bhat + display_name: Arun B Bhat + avatar: 'assets/images/arun.png' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: arun@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "SRE Team Lead at Platformatory" + Subramanya: + name: Subramanya Madhyastha + display_name: Subramanya Madhyastha + avatar: 'assets/images/subramanya.jpg' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: subramanya@platformatory.io + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "DevOps engineer at Platformatory specializing in Site Reliability Engineering and Infrastructure." + Vikhyat: + name: Vikhyat Shetty + display_name: Vikhyat Shetty + avatar: 'assets/images/vikhyat.jpg' + gravatar: b1cc14991db7a456fcd761680bbc8f81 + email: vikhyat@platformatory.com + web: https://www.platformatory.io + twitter: https://twitter.com/platformatory + description: "DevOps/SRE at Platformatory" # Plugins plugins: @@ -62,9 +143,9 @@ kramdown: line_numbers: true # Adsense (change to "enabled" to activate, also your client id and ad slot. Create a new ad unit from your Adsense account to get the slot.) -adsense: "disabled" -adsense-data-ad-client: "ca-pub-3412143450191416" -adsense-data-ad-slot: "1363087678" +#adsense: "disabled" +#adsense-data-ad-client: "ca-pub-3412143450191416" +#adsense-data-ad-slot: "1363087678" # Lazy Images ("enabled" or "disabled") lazyimages: "disabled" diff --git a/_includes/disqus.html b/_includes/disqus.html index af4d8719c9..f0e2c18f6f 100644 --- a/_includes/disqus.html +++ b/_includes/disqus.html @@ -1,14 +1,23 @@
-
- - - comments powered by Disqus +
+ + +
diff --git a/_includes/featuredbox.html b/_includes/featuredbox.html index 90f1b2902d..73203d737d 100644 --- a/_includes/featuredbox.html +++ b/_includes/featuredbox.html @@ -1,59 +1,79 @@ {% assign author = site.authors[post.author] %}
-
-
-
-
- +
+
+
+
+
+ + {% if site.lazyimages == "enabled" %} {% else %} {% endif %} + +
+
+
+
+
+
+
+

+ + {{ post.title }} + + {% if post.rating %} +
+ {% include star_rating_postbox.html %}
+ {% endif %} +

+ +

+ {{ post.teaser | strip_html | truncatewords:25 }} +

-
-
-
-
-

- {{ post.title }} - {% if post.rating %} -
- {% include star_rating_postbox.html %} -
- {% endif %} -

+ +
+
+
- \ No newline at end of file + diff --git a/_includes/postbox.html b/_includes/postbox.html index 80160e2d43..e8c2e15d99 100644 --- a/_includes/postbox.html +++ b/_includes/postbox.html @@ -1,51 +1,67 @@ {% assign author = site.authors[post.author] %} -
-
- -
-

- {{ post.title }} - {% if post.rating %} -
- {% include star_rating_postbox.html %} -
- {% endif %} -

-

{{ post.excerpt | strip_html | truncatewords:30 }}

+
+
+
+ + {% if post.image %} {% if site.lazyimages == "enabled" %} {{ post.title }} {% + else %} {{ post.title }} {% endif %} {% endif %} + +
+
+

+ + {{ post.title }} + + {% if post.rating %} +
+ {% include star_rating_postbox.html %}
-

+

+ {{ post.teaser | strip_html | truncatewords:30 }} +

+
+ +
- \ No newline at end of file + diff --git a/_includes/postcta.html b/_includes/postcta.html new file mode 100644 index 0000000000..1e96f5e034 --- /dev/null +++ b/_includes/postcta.html @@ -0,0 +1,16 @@ + + \ No newline at end of file diff --git a/_layouts/default.html b/_layouts/default.html index 337b917db3..5400e4daed 100644 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -34,6 +34,7 @@ {% capture layout %}{% if page.layout %}layout-{{ page.layout }}{% endif %}{% endcapture %} +
+
+ {% if page.ctas%} +
+ {% include postcta.html %} +
+ {% endif %} +
@@ -143,13 +241,13 @@

Summary

{% if page.comments != false %} -
-
-
- {% include disqus.html %} -
+
+
+
+ {% include disqus.html %}
+
{% endif %} diff --git a/_posts/2018-01-11-customer-service.md b/_posts/2018-01-11-customer-service.md deleted file mode 100644 index 902b51bfc7..0000000000 --- a/_posts/2018-01-11-customer-service.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -layout: post -title: "Inception Movie" -author: john -categories: [ Jekyll, tutorial ] -tags: [red, yellow] -image: assets/images/11.jpg -description: "My review of Inception movie. Acting, plot and something else in this short description." -featured: true -hidden: true -rating: 4.5 ---- - -Review products, books, movies, restaurant and anything you like on your Jekyll blog with Mediumish! JSON-LD ready for review property. - -#### How to use? - -It's actually really simple! Add the rating in your YAML front matter. It also supports halfs: - -```html ---- -layout: post -title: "Inception Movie" -author: john -categories: [ Jekyll, tutorial ] -tags: [red, yellow] -image: assets/images/11.jpg -description: "My review of Inception movie. Actors, directing and more." -rating: 4.5 ---- -``` diff --git a/_posts/2018-01-11-quick-start-guide.md b/_posts/2018-01-11-quick-start-guide.md deleted file mode 100644 index 178e04003c..0000000000 --- a/_posts/2018-01-11-quick-start-guide.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: post -title: "Let's test spoilers" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/12.jpg -featured: true -hidden: true ---- - -Director Roland Suso Richter's enigmatic psychological thriller (direct to video/DVD) was based upon screenwriter Michael Cooney's own play "Point of Death" - a title that gave away the film's entire plot twist premise. - -As in many similar films, such as Jacob's Ladder (1990), Soul Survivors (2001), and The Butterfly Effect (2004), events and people were thoroughly distorted and confused because the protagonist was at the point of death. The tagline was misleading: - -"When You Don't Have a Memory, How Can You Remember Who to Trust?" - -The mind-warping film opened with a hospital patient Simon Cable (Ryan Phillippe) awakening in a hospital with little knowledge (amnesia perhaps?) of what had happened, and why he was there, etc. He was told by attending Dr. Jeremy Newman (Stephen Rea) that it was July 29, 2002 (Simon thought it was the year 2000 - he was confused - he heard a doctor say 20:00 hours!) and that he had died for two minutes from cardiac arrest following the near-fatal accident -- but he had been revived ("You're as good as new"). Dr. Newman: "Simon, this is the 29th of July. The year is 2002. And your wife, whose name is Anna, is waiting outside." - -(The doctor left off four crucial additional words, revealed in the film's ending.) (Spoiler: Simon had died and was not resuscitated!). - -A major clue to everything that truly happened was the scene that played next under the credits - hospital staff failed to bring a patient back to life with a defibrillator after a car accident. Chest compressions failed and there was no pulse. A second major clue was provided by hospital orderly Travis (Stephen Graham): Everybody dies. No mystery there. But why and how everyone dies. Now, there's a mystery worth solving. Probably the biggest mystery there is. - -#### So how do we do spoilers? - -```html -My hidden paragraph here. 
-``` diff --git a/_posts/2018-01-12-is-intelligence-enough.md b/_posts/2018-01-12-is-intelligence-enough.md deleted file mode 100644 index 707f8d97af..0000000000 --- a/_posts/2018-01-12-is-intelligence-enough.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -layout: post -title: "External Featured Image" -author: sal -categories: [ Jekyll, tutorial, web development ] -image: "https://images.unsplash.com/photo-1541544537156-7627a7a4aa1c?ixlib=rb-0.3.5&ixid=eyJhcHBfaWQiOjEyMDd9&s=a20c472bc23308e390c8ffae3dd90c60&auto=format&fit=crop&w=750&q=80" ---- -Education must also train one for quick, resolute and effective thinking. To think incisively and to think for one's self is very difficult. - -> We are prone to let our mental life become invaded by legions of half truths, prejudices, and propaganda. At this point, I often wonder whether or not education is fulfilling its purpose. A great majority of the so-called educated people do not think logically and scientifically. - -Even the press, the classroom, the platform, and the pulpit in many instances do not give us objective and unbiased truths. To save man from the morass of propaganda, in my opinion, is one of the chief aims of education. Education must enable one to sift and weigh evidence, to discern the true from the false, the real from the unreal, and the facts from the fiction. - -The function of education, therefore, is to teach one to think intensively and to think critically. But education which stops with efficiency may prove the greatest menace to society. The most dangerous criminal may be the man gifted with reason, but with no morals. - -The late Eugene Talmadge, in my opinion, possessed one of the better minds of Georgia, or even America. Moreover, he wore the Phi Beta Kappa key. By all measuring rods, Mr. Talmadge could think critically and intensively; yet he contends that I am an inferior being. Are those the types of men we call educated? - -We must remember that intelligence is not enough. Intelligence plus character--that is the goal of true education. The complete education gives one not only power of concentration, but worthy objectives upon which to concentrate. The broad education will, therefore, transmit to one not only the accumulated knowledge of the race but also the accumulated experience of social living. - diff --git a/_posts/2018-01-12-never-stopped-worrying-never-loved-bomb.md b/_posts/2018-01-12-never-stopped-worrying-never-loved-bomb.md deleted file mode 100644 index e790de4793..0000000000 --- a/_posts/2018-01-12-never-stopped-worrying-never-loved-bomb.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: post -title: "Never stopped worrying or loving the bomb" -author: sal -categories: [ Jekyll ] -image: assets/images/14.jpg ---- -> I’ve been through fire and water, I tell you! From my earliest pebblehood the wildest things you could imagine have been happening to this world of ours, and I have been right in the midst of them. - -So begins Hallam Hawksworth’s The Strange Adventures of a Pebble. Written in the 1920s, the book was part of a series which also included The Adventures of a Grain of Dust and A Year in the Wonderland of Trees, all of which were supposed to introduce children to the world of Natural Sciences. In each of them, Hawksworth personifies the natural object he is exploring, and using a mixture of folk tales, scientific facts and colloquial, friendly explanations guides the reader through the history of the natural world. 
It’s a real thrill of a ride, dramatizing the life cycle of supposedly dull things. The Adventures of a Grain of Dust begins even more loudly than Pebble: - -I don’t want you to think that I’m boasting, but I do believe I’m one of the greatest travellers that ever was; and if anybody, living or dead, has ever gone through with more than I have I’d like to hear about it. -Hallam Hawksworth was the pen-name of teacher Francis Blake Atkinson. He was married to the author Eleanor Stackhouse Atkinson, author of the children’s classic Greyfriars Bobby, which was based on the (supposedly) true story of a Scottish dog who spent fourteen years guarding his masters grave. The couple were both committed to education and published a weekly magazine for Chicago high school students called The Little Chronicle, as well as working for Encyclopaedia companies later in life. diff --git a/_posts/2018-01-12-options-for-creating-new-site-with-jekyll.md b/_posts/2018-01-12-options-for-creating-new-site-with-jekyll.md deleted file mode 100644 index c71c09e484..0000000000 --- a/_posts/2018-01-12-options-for-creating-new-site-with-jekyll.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -layout: post -title: "Options for creating a new site with Jekyll" -author: john -categories: [ Jekyll, tutorial ] -image: assets/images/13.jpg ---- - -`jekyll new ` installs a new Jekyll site at the path specified (relative to current directory). In this case, Jekyll will be installed in a directory called `myblog`. Here are some additional details: - -- To install the Jekyll site into the directory you're currently in, run `jekyll new` . If the existing directory isn't empty, you can pass the --force option with jekyll new . --force. -- `jekyll new` automatically initiates `bundle install` to install the dependencies required. (If you don't want Bundler to install the gems, use `jekyll new myblog --skip-bundle`.) -- By default, the Jekyll site installed by `jekyll new` uses a gem-based theme called Minima. With gem-based themes, some of the directories and files are stored in the theme-gem, hidden from your immediate view. -- We recommend setting up Jekyll with a gem-based theme but if you want to start with a blank slate, use `jekyll new myblog --blank` -- To learn about other parameters you can include with `jekyll new`, type `jekyll new --help`. \ No newline at end of file diff --git a/_posts/2018-01-12-press-and-education.md b/_posts/2018-01-12-press-and-education.md deleted file mode 100644 index 98e8c70d58..0000000000 --- a/_posts/2018-01-12-press-and-education.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -layout: post -title: "Press and education" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/7.jpg ---- -Even the press, the classroom, the platform, and the pulpit in many instances do not give us objective and unbiased truths. To save man from the morass of propaganda, in my opinion, is one of the chief aims of education. Education must enable one to sift and weigh evidence, to discern the true from the false, the real from the unreal, and the facts from the fiction. - -Education must also train one for quick, **resolute and effective thinking**. To think incisively and to think for one's self is very difficult. - -> We are prone to let our mental life become invaded by legions of half truths, prejudices, and propaganda. At this point, I often wonder whether or not education is fulfilling its purpose. A great majority of the so-called educated people do not think logically and scientifically. 
- - -The function of education, therefore, is to teach one to think intensively and to think critically. But education which stops with efficiency may prove the greatest menace to society. The most dangerous criminal may be the man gifted with reason, but with no morals. - -The late Eugene Talmadge, in my opinion, possessed one of the better minds of Georgia, or even America. Moreover, he wore the Phi Beta Kappa key. By all measuring rods, Mr. Talmadge could think critically and intensively; yet he contends that I am an inferior being. Are those the types of men we call educated? - -We must remember that intelligence is not enough. Intelligence plus character--that is the goal of true education. The complete education gives one not only power of concentration, but worthy objectives upon which to concentrate. The broad education will, therefore, transmit to one not only the accumulated knowledge of the race but also the accumulated experience of social living. - diff --git a/_posts/2018-01-12-red-riding.md b/_posts/2018-01-12-red-riding.md deleted file mode 100644 index 66ad47606a..0000000000 --- a/_posts/2018-01-12-red-riding.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: "Red Riding Hood" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/1.jpg ---- -The first mass-produced book to deviate from a rectilinear format, at least in the United States, is thought to be this 1863 edition of Red Riding Hood, cut into the shape of the protagonist herself with the troublesome wolf curled at her feet. Produced by the Boston-based publisher Louis Prang, this is the first in their “Doll Series”, a set of five “die-cut” books, known also as shape books — the other titles being Robinson Crusoe, Goody Two-Shoes (also written by Red Riding Hood author Lydia Very), Cinderella, and King Winter. - -An 1868 Prang catalogue would later claim that such “books in the shape of a regular paper Doll… originated with us”. - -> It would seem the claim could also extend to die cut books in general, as we can’t find anything sooner, but do let us know in the comments if you have further light to shed on this! Such books are, of course, still popular in children’s publishing today, though the die cutting is not now limited to mere outlines, as evidenced in a beautiful 2014 version of the same Little Red Riding Hood story. - -The die cut has also been employed in the non-juvenile sphere as well, a recent example being Jonathan Safran Foer’s ambitious Tree of Codes. - -As for this particular rendition of Charles Perrault’s classic tale, the text and design is by Lydia Very (1823-1901), sister of Transcendentalist poet Jones Very. The gruesome ending of the original — which sees Little Red Riding Hood being gobbled up as well as her grandmother — is avoided here, the gore giving way to the less bloody aims of the morality tale, and the lesson that one should not disobey one’s mother. diff --git a/_posts/2018-01-12-tree-of-codes.md b/_posts/2018-01-12-tree-of-codes.md deleted file mode 100644 index e5ccc4fbab..0000000000 --- a/_posts/2018-01-12-tree-of-codes.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -layout: post -title: "Tree of Codes" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/9.jpg ---- -The first mass-produced book to deviate from a rectilinear format, at least in the United States, is thought to be this 1863 edition of Red Riding Hood, cut into the shape of the protagonist herself with the troublesome wolf curled at her feet. 
Produced by the Boston-based publisher Louis Prang, this is the first in their “Doll Series”, a set of five “die-cut” books, known also as shape books — the other titles being Robinson Crusoe, Goody Two-Shoes (also written by Red Riding Hood author Lydia Very), Cinderella, and King Winter. - -As for this particular rendition of Charles Perrault’s classic tale, the text and design is by Lydia Very (1823-1901), sister of Transcendentalist poet Jones Very. The gruesome ending of the original — which sees Little Red Riding Hood being gobbled up as well as her grandmother — is avoided here, the gore giving way to the less bloody aims of the morality tale, and the lesson that one should not disobey one’s mother. - -> It would seem the claim could also extend to die cut books in general, as we can’t find anything sooner, but do let us know in the comments if you have further light to shed on this! Such books are, of course, still popular in children’s publishing today, though the die cutting is not now limited to mere outlines, as evidenced in a beautiful 2014 version of the same Little Red Riding Hood story. - - -An 1868 Prang catalogue would later claim that such “books in the shape of a regular paper Doll… originated with us”. - -The die cut has also been employed in the non-juvenile sphere as well, a recent example being Jonathan Safran Foer’s ambitious Tree of Codes. - diff --git a/_posts/2018-01-12-we-all-wait-for-summer.md b/_posts/2018-01-12-we-all-wait-for-summer.md deleted file mode 100644 index a8683bc721..0000000000 --- a/_posts/2018-01-12-we-all-wait-for-summer.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -layout: post -title: "We all wait for summer" -author: john -categories: [ Jekyll, tutorial ] -image: assets/images/5.jpg -rating: .5 ---- -As I engage in the so-called "bull sessions" around and about the school, I too often find that most college men have a misconception of the purpose of education. Most of the "brethren" think that education should equip them with the proper instruments of exploitation so that they can forever trample over the masses. Still others think that education should furnish them with noble ends rather than means to an end. - -It seems to me that education has a two-fold function to perform in the life of man and in society: the one is utility and the other is culture. Education must enable a man to become more efficient, to achieve with increasing facility the ligitimate goals of his life. \ No newline at end of file diff --git a/_posts/2018-05-12-about-bundler.md b/_posts/2018-05-12-about-bundler.md deleted file mode 100644 index 5b289c0374..0000000000 --- a/_posts/2018-05-12-about-bundler.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: post -title: "About Bundler" -author: sal -categories: [ Jekyll ] -image: assets/images/2.jpg -rating: 3 ---- -`gem install bundler` installs the bundler gem through RubyGems. You only need to install it once - not every time you create a new Jekyll project. Here are some additional details: - -`bundler` is a gem that manages other Ruby gems. It makes sure your gems and gem versions are compatible, and that you have all necessary dependencies each gem requires. - -The `Gemfile` and `Gemfile.lock` files inform `Bundler` about the gem requirements in your site. If your site doesn’t have these Gemfiles, you can omit `bundle exec` and just `run jekyll serve`. 
- -When you run `bundle exec jekyll serve`, `Bundler` uses the gems and versions as specified in `Gemfile.lock` to ensure your Jekyll site builds with no compatibility or dependency conflicts. - -For more information about how to use `Bundler` in your Jekyll project, this tutorial should provide answers to the most common questions and explain how to get up and running quickly. diff --git a/_posts/2018-06-12-acumulated-experience.md b/_posts/2018-06-12-acumulated-experience.md deleted file mode 100644 index 40b880569e..0000000000 --- a/_posts/2018-06-12-acumulated-experience.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: "Accumulated experience of social living" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/15.jpg ---- -The die cut has also been employed in the non-juvenile sphere as well, a recent example being Jonathan Safran Foer’s ambitious Tree of Codes. - -As for this particular rendition of Charles Perrault’s classic tale, the text and design is by Lydia Very (1823-1901), sister of Transcendentalist poet Jones Very. The gruesome ending of the original - which sees Little Red Riding Hood being gobbled up as well as her grandmother - is avoided here, the gore giving way to the less bloody aims of the morality tale, and the lesson that one should not disobey one’s mother. - -The first mass-produced book to deviate from a rectilinear format, at least in the United States, is thought to be this 1863 edition of Red Riding Hood, cut into the shape of the protagonist herself with the troublesome wolf curled at her feet. Produced by the Boston-based publisher Louis Prang, this is the first in their “Doll Series”, a set of five “die-cut” books, known also as shape books — the other titles being Robinson Crusoe, Goody Two-Shoes (also written by Red Riding Hood author Lydia Very), Cinderella, and King Winter. - -An 1868 Prang catalogue would later claim that such “books in the shape of a regular paper Doll... originated with us”. - -> It would seem the claim could also extend to die cut books in general, as we can’t find anything sooner, but do let us know in the comments if you have further light to shed on this! Such books are, of course, still popular in children’s publishing today, though the die cutting is not now limited to mere outlines, as evidenced in a beautiful 2014 version of the same Little Red Riding Hood story. \ No newline at end of file diff --git a/_posts/2018-06-12-education.md b/_posts/2018-06-12-education.md deleted file mode 100644 index 50e19abe61..0000000000 --- a/_posts/2018-06-12-education.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -layout: post -title: "Education must also train one for quick, resolute and effective thinking." -author: john -categories: [ Jekyll, tutorial ] -image: assets/images/3.jpg -beforetoc: "Markdown editor is a very powerful thing. In this article I'm going to show you what you can actually do with it, some tricks and tips while editing your post." -toc: true ---- -There are lots of powerful things you can do with the Markdown editor - -If you've gotten pretty comfortable with writing in Markdown, then you may enjoy some more advanced tips about the types of things you can do with Markdown! - -As with the last post about the editor, you'll want to be actually editing this post as you read it so that you can see all the Markdown code we're using. 
- - -## Special formatting - -As well as bold and italics, you can also use some other special formatting in Markdown when the need arises, for example: - -+ ~~strike through~~ -+ ==highlight== -+ \*escaped characters\* - - -## Writing code blocks - -There are two types of code elements which can be inserted in Markdown, the first is inline, and the other is block. Inline code is formatted by wrapping any word or words in back-ticks, `like this`. Larger snippets of code can be displayed across multiple lines using triple back ticks: - -``` -.my-link { - text-decoration: underline; -} -``` - -If you want to get really fancy, you can even add syntax highlighting using Rouge. - - -![walking]({{ site.baseurl }}/assets/images/8.jpg) - -## Reference lists - -The quick brown jumped over the lazy. - -Another way to insert links in markdown is using reference lists. You might want to use this style of linking to cite reference material in a Wikipedia-style. All of the links are listed at the end of the document, so you can maintain full separation between content and its source or reference. - -## Full HTML - -Perhaps the best part of Markdown is that you're never limited to just Markdown. You can write HTML directly in the Markdown editor and it will just work as HTML usually does. No limits! Here's a standard YouTube embed code as an example: - -

\ No newline at end of file diff --git a/_posts/2018-06-12-first-mass-produced.md b/_posts/2018-06-12-first-mass-produced.md deleted file mode 100644 index 605b9519df..0000000000 --- a/_posts/2018-06-12-first-mass-produced.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: post -title: "The first mass-produced book to deviate from a rectilinear format" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/17.jpg ---- -The first mass-produced book to deviate from a rectilinear format, at least in the United States, is thought to be this 1863 edition of Red Riding Hood, cut into the shape of the protagonist herself with the troublesome wolf curled at her feet. Produced by the Boston-based publisher Louis Prang, this is the first in their “Doll Series”, a set of five “die-cut” books, known also as shape books — the other titles being Robinson Crusoe, Goody Two-Shoes (also written by Red Riding Hood author Lydia Very), Cinderella, and King Winter. - -An 1868 Prang catalogue would later claim that such “books in the shape of a regular paper Doll... originated with us”. - -> It would seem the claim could also extend to die cut books in general, as we can’t find anything sooner, but do let us know in the comments if you have further light to shed on this! Such books are, of course, still popular in children’s publishing today, though the die cutting is not now limited to mere outlines, as evidenced in a beautiful 2014 version of the same Little Red Riding Hood story. - -The die cut has also been employed in the non-juvenile sphere as well, a recent example being Jonathan Safran Foer’s ambitious Tree of Codes. - -As for this particular rendition of Charles Perrault’s classic tale, the text and design is by Lydia Very (1823-1901), sister of Transcendentalist poet Jones Very. The gruesome ending of the original - which sees Little Red Riding Hood being gobbled up as well as her grandmother - is avoided here, the gore giving way to the less bloody aims of the morality tale, and the lesson that one should not disobey one’s mother. diff --git a/_posts/2018-06-12-powerful-things-markdown-editor.md b/_posts/2018-06-12-powerful-things-markdown-editor.md deleted file mode 100644 index 2d0f3f594f..0000000000 --- a/_posts/2018-06-12-powerful-things-markdown-editor.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -layout: post -title: "Powerful things you can do with the Markdown editor" -author: sal -categories: [ Jekyll, tutorial ] -image: assets/images/16.jpg ---- -There are lots of powerful things you can do with the Markdown editor. If you've gotten pretty comfortable with writing in Markdown, then you may enjoy some more advanced tips about the types of things you can do with Markdown! - -As with the last post about the editor, you'll want to be actually editing this post as you read it so that you can see all the Markdown code we're using. - - -## Special formatting - -As well as bold and italics, you can also use some other special formatting in Markdown when the need arises, for example: - -+ ~~strike through~~ -+ ==highlight== -+ \*escaped characters\* - - -## Writing code blocks - -There are two types of code elements which can be inserted in Markdown, the first is inline, and the other is block. Inline code is formatted by wrapping any word or words in back-ticks, `like this`. Larger snippets of code can be displayed across multiple lines using triple back ticks: - -``` -.my-link { - text-decoration: underline; -} -``` - -#### HTML - -```html -
  • - - - -
  • -``` - -#### CSS - -```css -.highlight .c { - color: #999988; - font-style: italic; -} -.highlight .err { - color: #a61717; - background-color: #e3d2d2; -} -``` - -#### JS - -```js -// alertbar later -$(document).scroll(function () { - var y = $(this).scrollTop(); - if (y > 280) { - $('.alertbar').fadeIn(); - } else { - $('.alertbar').fadeOut(); - } -}); -``` - -#### Python - -```python -print("Hello World") -``` - -#### Ruby - -```ruby -require 'redcarpet' -markdown = Redcarpet.new("Hello World!") -puts markdown.to_html -``` - -#### C - -```c -printf("Hello World"); -``` - - - - -![walking]({{ site.baseurl }}/assets/images/8.jpg) - -## Reference lists - -The quick brown jumped over the lazy. - -Another way to insert links in markdown is using reference lists. You might want to use this style of linking to cite reference material in a Wikipedia-style. All of the links are listed at the end of the document, so you can maintain full separation between content and its source or reference. - -## Full HTML - -Perhaps the best part of Markdown is that you're never limited to just Markdown. You can write HTML directly in the Markdown editor and it will just work as HTML usually does. No limits! Here's a standard YouTube embed code as an example: - -

    \ No newline at end of file diff --git a/_posts/2021-05-30-platform-engineering-manifesto.md b/_posts/2021-05-30-platform-engineering-manifesto.md new file mode 100644 index 0000000000..f3452a96e2 --- /dev/null +++ b/_posts/2021-05-30-platform-engineering-manifesto.md @@ -0,0 +1,67 @@ +--- +layout: post +title: "A Platform engineering manifesto" +author: p6 +categories: + [Platform Engineering, Data, Infrastructure, Kafka, Kong, Kubernetes] +image: assets/blog-images/platform-strategy.svg +featured: true +hidden: true +cat_box_title: Contact Us +ctas: + - title: Get In Touch + description: "Have questions or need assistance? Our team is here to help" + url: "/contact/" + +teaser: Platform Engineering is dead. Long live platform engineering! +toc: true +--- + +![50-shades](/blog/assets/blog-images/50-shades.jpg "50 Shades of DevOps") + +There are many hot takes on Platform Engineering. There are success stories and there are horror stories. When done well, there is a team topology that enables a "do it like a digital native" operating model. When done not so well, you end up with a mess of 99 problems cherry-picked straight from the CNCF landscape graveyard of tools. + +As a company that has chosen to ride the trend, our belief is that platform engineering should be considered essential for any tech company operating in a complex domain, with the objective of providing a developer experience (DevX) that helps you scale. Platforms don't exist in a vacuum; they are grounded ultimately in the reality of business domains and MUST accelerate product engineering teams in their endeavours. + +To be clear, not everyone needs a platform engineering team in the first place. One is largely justified when the size of the engineering organization exceeds the Dunbar number, or at least faces a non-trivial problem of scale. + +Nonetheless, if you do choose to build a platform engineering team, there are 3 key performance indicators to consider in order to measure success. + +- NPS, CSAT from product engineering teams, as a measure of developer experience +- Usage metrics of the platform products in themselves +- Coverage of platform concerns and time to first production deployment (for Product teams) + +# Own a platform architecture & platform product vision + +Everything begins with robust product foundations and an (evolving) architecture vision. Platform teams must own both a product & an architecture charter. The unique requirements of platform products may themselves be driven by unique challenges in the domain (or specifically those of domain-centred product engineering teams) and the incumbent IT landscape. + +> ProTip: A Pareto-principle-compliant north star vision of a robust platform architecture must think through the following concerns: Smart Infrastructure, Data Integration & APIs. + +# Building tooling and plugins: For the sake of developers, operators and everything in between + +The platform is a product that is mainly built with engineers in mind. "You build it, you run it" is ultimately an elusively aspirational operating model. In reality, re-use dictates efficiency, and continuous reinvention of the wheel is harmful to productivity, which is almost a side-effect of autonomous but siloed teams. + +> A platform team topology, on the other hand, vests a lean, craft-focused engineering team to build specialization across a variety of cross-cutting concerns while enabling a well governed tech estate. + +Such "Platform engineering" teams are ideally suited to building re-usable artefacts and easily adapted tooling for most concerns. Notably, this will include, for example: + +- Infrastructure as code modules +- Golden images (as well as other build-time and run-time tooling) +- Common Build / Deploy (CI-CD) Pipelines +- Observability & monitoring toolkits
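+
+One minimal sketch of such a re-usable artefact, assuming a platform-owned Python metrics helper that product services import instead of hand-rolling instrumentation (the module, metric names and port here are our own illustrative assumptions, not an existing library):
+
+```python
+from prometheus_client import Counter, Histogram, start_http_server
+
+# Standardized, platform-wide metric names and labels
+REQUESTS = Counter("app_requests_total", "Requests handled", ["service", "route", "status"])
+LATENCY = Histogram("app_request_seconds", "Request latency in seconds", ["service", "route"])
+
+def observe(service: str, route: str, status: int, seconds: float) -> None:
+    """Record one request against the platform's standard metrics."""
+    REQUESTS.labels(service, route, str(status)).inc()
+    LATENCY.labels(service, route).observe(seconds)
+
+def start_metrics_endpoint(port: int = 9102) -> None:
+    """Expose /metrics for the platform's Prometheus scrape configuration."""
+    start_http_server(port)
+```
+
+Shipping small helpers like this (alongside the IaC modules, golden images and pipelines above) is how a platform team turns cross-cutting concerns into paved roads rather than tickets.
+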
+# Enable shared-services Operations (securely) in a multi-tenant, hybrid cloud enterprise + +At the heart of enterprise grade operations is really a tenancy problem; tenancy is really "who shares what in the house". A platform-based operating model needs to provide mechanisms for housing tenants in dedicated or shared models, based on resource competition (and thus the need for isolation), security, availability, compliance and cost requirements. + +Operating model flexibility should also provide for having resources managed directly via embedded DevOps teams that "build and run" things, or shared SRE (or IT/Cloud-ops) teams that take over this function for several applications or teams. + +> ProTip: There is a need for governance as well as potentially cost attribution or chargeback to teams that foot the bill. + +# Specialize in running complex, distributed systems + +Fully managed services attempt to dumb down the details of large-scale distributed systems, yet many are actually very leaky abstractions. A trade-off of having cloud vendor neutrality is that one must accept the reality of having fully or partially managed services co-existing with fully self-managed ones, particularly when costs become prohibitive either due to scale (or the lack thereof). This poses a dilemma for technology leaders around the operating model fit. + +There is a defining line of separation between quality platform engineering teams and vanilla (cloud\|dev)-ops teams. Platform engineers will need to gain specific expertise in running complex, distributed systems that have become de-facto platform primitives, providing the Lego blocks that allow you to build higher order platforms. + +> Platform adoption is ultimately constrained by expertise and therefore it is crucial that platform engineering teams build highly artisanal guilds that can respond to architectural problems in distributed systems and provide a POV on performance, security and a variety of cross cutting concerns. diff --git a/_posts/2021-05-31-three-pillars-of-platform-architecture.md b/_posts/2021-05-31-three-pillars-of-platform-architecture.md new file mode 100644 index 0000000000..d4d6075a52 --- /dev/null +++ b/_posts/2021-05-31-three-pillars-of-platform-architecture.md @@ -0,0 +1,84 @@ +--- +layout: post +title: "The 3 pillars of Platform architecture" +author: p6 +categories: [ Platform Engineering, Data, Infrastructure, Kafka, Kong, Kubernetes ] +image: assets/blog-images/platform-arch-model.svg +teaser: Data, APIs and Cloud native infrastructure +featured: true +hidden: true +toc: true +--- + +Platform Engineering is a hot topic. There is an optimist's view: that it can potentially simplify the tech operating model of large, complex organizations. Yet there is also a cynical take: this is no panacea for Conway problems. + +The crucial question is, what even should be extracted into a "platform" boundary? Greedy platforms tend to stifle team autonomy. Loose platforms tend to add little value and do more harm than anything else. + +Allow us to provide a take on what we think should be the three most important pillars for a sound platform strategy. 
Our recommendation is based on real-life experience of enabling platform-led transformations and areas that have proved most conducive for such an operating model, while providing clear, discernible business value. We will also talk through some universal platform building blocks that you must consider. + + +## Hybrid Cloud infrastructure platform + +In the ever-evolving cloud landscape, hardware comes cheap and commoditized. While cloud vendors will still seek to differentiate, it is now amply clear that containers are pretty much the standard level of atomicity for any workload. Therefore, the underlying IaaS, at least in theory, is very fungible. + +This also signals the rise of Kubernetes as a great leveler, making multi-cloud and hybrid-cloud strategies actually viable without rocket surgery. + +As Kelsey Hightower puts it, "Kubernetes is the interface to the cloud". If there's one thing K8S has cracked, it is the declarative control plane API model, and by that it ensures that even if a competitor were to emerge, it would begin by baselining itself against the same API standard. + +> We will go out on a limb and make another prediction, in the same spirit as Atwood's law: Any sufficiently complicated distributed system that can be natively run on Kubernetes will eventually be run as an operator. + +That means, for those who may need to steer away from managed services for whatever reason, running an operator is very literally the next best thing. We expect K8S will be the chassis for building just about anything cloud native in the next 10 years; PaaS vendors will come up with even better and more specialized developer and operator experiences to run K8S-at-the-core apps with less hassle. + +There are naysayers who think K8S is just too complex (there may be a hint of truth in this even in 2022), but the sheer rate at which production workloads are going into K8S is data-based testimony, staring you in the face, that it is hardly as complex as they say it is. This is definitely led by the cloud providers themselves, but there is also a long tail of hybrid cloud compatible K8S distributions (such as VMWare Tanzu, RedHat OpenShift, Rancher, Nirmata, amongst others) that are enabling this. There are discretionary choices available for which flavor of Kubernetes suits you best, and the vendor posture is towards K8S not as infrastructure modernization but as the foundational need for modernizing apps in itself. + +Even capabilities that were seemingly a point of proprietary leverage for the public cloud, such as serverless functions, have now gained a first-class foothold in Kubernetes, allowing product teams to build towards a deployment target that may run on any IaaS, with Kubernetes. + +Our advice: if you are willing to make some investments towards Kubernetes, it is about time. Do it. It will certainly prove to be a very robust foundation for building higher order platforms.
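+
+To make the "declarative control plane API" point concrete, here is a minimal sketch using the official Kubernetes Python client: we describe the desired Deployment and let the control plane reconcile towards it. The name, image and replica count are illustrative assumptions.
+
+```python
+from kubernetes import client, config
+
+config.load_kube_config()  # use config.load_incluster_config() when running inside a pod
+
+# Declare the desired state; the control plane does the rest
+desired = client.V1Deployment(
+    metadata=client.V1ObjectMeta(name="orders-api", labels={"app": "orders-api"}),
+    spec=client.V1DeploymentSpec(
+        replicas=3,
+        selector=client.V1LabelSelector(match_labels={"app": "orders-api"}),
+        template=client.V1PodTemplateSpec(
+            metadata=client.V1ObjectMeta(labels={"app": "orders-api"}),
+            spec=client.V1PodSpec(
+                containers=[client.V1Container(name="orders-api", image="ghcr.io/example/orders-api:1.0.0")]
+            ),
+        ),
+    ),
+)
+
+client.AppsV1Api().create_namespaced_deployment(namespace="default", body=desired)
+```
+
+The same declarative shape is what operators build on: they watch custom resources and reconcile real-world state towards them, which is why the operator pattern travels so well across clouds.
+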
+ +## The Data exchange platform + +As DB sizes grow, it is inevitable that megaliths and monoliths must die; shared state and data just have practical limits. Services must own their data and therefore enable themselves towards decomposition into smaller parts and distribution. When that happens, the need for integrations arises. + +The platform opportunities in integration are no recent discovery. Messaging platforms have been around for decades. In the cloud native world, enterprise integration patterns may still hold (give or take), but the aforementioned integration technologies no longer make sense for microservices communication. The services get smaller, chattier and desire eventual consistency patterns which these systems suck at, not to mention that they barely lend themselves to distributed, hybrid cloud deployments without being bottlenecks in themselves. + +There's a second, larger problem: data integration. The analytical problem space increasingly converges into the operational (this is a great thing) and therefore real-time data integration is a much bigger problem than ever. Traditional big data approaches fail to meet business expectations where time-to-useful-data (be it insight, reporting or other analytics) has to be optimized to its bounds. + +The best way to solve for both is to embrace Event Driven, Reactive architecture as the de-facto way to architect applications. In this model: + +1. Choreography must be preferred over orchestration. +2. Applications and services exchange data through streams: event stream processing becomes the mechanism to synchronize and process data continuously and consistently. +3. Most integrations can be handled through the same eventing backbone: an event store holds the logstream of events that can be applied (and replayed if necessary) to derive the state of an application. + +Note that this problem has large-scale distributed system underpinnings to it. As it emerges, it is clear that Apache Kafka is the standout winner for event streaming. Its use-cases are myriad: everything from (S)EDA, event-sourcing and stream processing to data movement. Kafka solves core problems such as exactly-once processing, offers a brokered pub-sub API model with the highest number of tuning knobs and customizability to fit purpose, provides first-class data persistence for reliable processing, and lends itself to global scale, high throughput, low latency, fault tolerant deployments. + +Apart from these, Kafka also offers an ecosystem of connectors, stream processing bolt-ons and schema management. + +> Most of all, similar to K8S, Apache Kafka's data plane protocol has emerged as the de-facto API standard for brokered eventing. This is evident in that even Kafka's emergent competition, such as Azure Event Hubs, Apache Pulsar or the new kid on the block, RedPanda (formerly vectorized.io), that are innovating in the path of Kafka (in arguably superior ways), still maintain conformance and compatibility to the same API protocol. + +The Kafka based pub-sub streaming model is clearly emerging as the integration & data backbone of the enterprise. Data and eventing mesh-like architectures will evolve on top of Kafka as the data plane standard and enable Data Productization that will be owned and stewarded by domains. App and service integrations will, on the other hand, leverage changelog streams and increasingly lead to event-sourced architectures, with newer Kafka-centric API standards (such as AsyncAPI, CloudEvents) being the guardrails. + +Our advice: if there's one thing you'd want to do to tame data across the enterprise, consider a platform architecture with a "universal" stream bus like Kafka. This will certainly provide definitive ROI as an operational platform, while unlocking possibilities in real-time analytics, AI/ML and traditional big data processing.
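+
+A minimal sketch of the changelog-stream idea above, using the confluent-kafka Python client; the broker address, topic, consumer group and event shape are illustrative assumptions:
+
+```python
+import json
+from confluent_kafka import Producer, Consumer
+
+# A service publishes a domain event to the shared eventing backbone
+producer = Producer({"bootstrap.servers": "localhost:9092"})
+producer.produce(
+    "orders.events",
+    key="order-42",
+    value=json.dumps({"type": "OrderPlaced", "order_id": 42, "amount": 99.50}),
+)
+producer.flush()
+
+# Any downstream service can consume (and replay) the stream to derive its own state
+consumer = Consumer({
+    "bootstrap.servers": "localhost:9092",
+    "group.id": "billing-projection",
+    "auto.offset.reset": "earliest",  # replay from the beginning to rebuild state
+})
+consumer.subscribe(["orders.events"])
+msg = consumer.poll(5.0)
+if msg is not None and msg.error() is None:
+    event = json.loads(msg.value())
+    print("applying", event["type"], "to the billing projection")
+consumer.close()
+```
+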
+ +## The Connectivity platform + +APIs are the very connective tissue of the modern enterprise. + +The explosion of services in the SOA era has resulted in the emergence of API Management as a formal discipline, over and above classic integration middleware platforms (such as ESBs); API platforms focus on API-based mediation, composition patterns and policy-based controls on security, traffic management and quotas. + +While API Management has traditionally applied in the SOA context, the microservices landscape brings in a different set of challenges than the Northbound-Southbound communication that API gateways traditionally addressed. This is the challenge of inter-service communication, or East-West traffic. This is the emerging solution space of service meshes, where the opportunities include the same concerns as API gateways and API management (albeit with more intelligent proxy or agent sidecars), but also extend to routing, load balancing, circuit breaking, traffic shifting, mirroring and, most of all, observability through tracing across a graph of services. + +To date, there have been attempts to converge gateways and service meshes, but in our opinion, the most complete vision towards this is by Kong (and to a lesser extent, Solo.io). Kong offers a service mesh (Kuma, which is not SMI compatible and uses Envoy as the data plane), an API gateway (nginx in the data plane) and a programmable plugin layer that supports Lua, Golang, Python and JS. + +While it is unfair to say Kong is the de-facto standard of its category in the way Kafka and K8S are, we find it the most plausible candidate worthy of that status eventually; Kong's core concepts reduce to services and upstreams, routes and consumers, which may be declaratively targeted by plugins. Kong boasts a large ecosystem of plugins and forks (such as the notable Apisix), support for HTTP/2 and gRPC on both VMs as well as K8S (with a K8s ingress); most of all, Kong typically provides an order-of-magnitude lift in performance vis-a-vis others, and this constitutes a very compelling proposition. + +> The coalescence of API gateways and service meshes has only begun. We predict Envoy-based data planes will become better and more efficient; we will increasingly look at architectures that not only externalize all network-level cross-cutting concerns (such as security, traffic management, routing) to the ubiquitous Envoy-at-the-core mesh layer, but also see much "smarter" wire-level interception capabilities for different technologies, thus enabling sophisticated capabilities around resilience, load balancing and, most of all, the API itself as an abstraction. + +This is only the beginning, but there are several interesting possibilities just in the networking world with eBPF at the core. On the other hand, HTTP/2 is emerging and there is a rebound of RPC protocols (gRPC, Thrift, Dubbo) that provide consistent typing and deep, idiomatic programming language support. Finally, we also recognize heterogeneous API standards emerging (GraphQL, AsyncAPI, CloudEvents to name some) for a variety of use-cases beyond the core request-response oriented REST/SOAP standards that have been prevalent. To address these and other possibilities, if we had to make a choice today - it would almost certainly be Kong. 
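+
+As a small illustration of the services/routes/plugins model described above, here is a rough sketch that drives Kong's Admin API to register a service, expose a route and attach a rate-limiting policy. The Admin API address, service name and upstream URL are illustrative assumptions.
+
+```python
+import requests
+
+KONG_ADMIN = "http://localhost:8001"
+
+# Register the upstream service behind the gateway
+requests.post(f"{KONG_ADMIN}/services",
+              json={"name": "orders", "url": "http://orders.internal:8080"}).raise_for_status()
+
+# Expose it on a path
+requests.post(f"{KONG_ADMIN}/services/orders/routes",
+              json={"name": "orders-route", "paths": ["/orders"]}).raise_for_status()
+
+# Attach a declarative traffic-management policy
+requests.post(f"{KONG_ADMIN}/services/orders/plugins",
+              json={"name": "rate-limiting", "config": {"minute": 100, "policy": "local"}}).raise_for_status()
+```
+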
+
+Finally, we must stress that APIs are just endpoints unless bundled together or built to suit the use-cases of their audiences (i.e., developers); for API Management to mature in an organization (regardless of API gateway or service mesh infrastructure), the curation of API products (or in an even more abstract sense, integration products) is the most important goal. This enables a transformation from APIs as integration endpoints towards APIs being, literally, your business as a service.
+
+
+# Conclusion
+
+There is clearly the emergence of a "k3" platform stack, consisting of Kubernetes, Kafka and Kong. These address, respectively, the three most important pillars of an enterprise digital platform strategy - namely infrastructure, data and APIs. For the discerning CT(I)O or platform engineering leader, the opportunities are vast and the OSS community has a lot to offer. It is important to take note and strategize accordingly.
diff --git a/_posts/2023-03-03-SaaS-architecture-primer.md b/_posts/2023-03-03-SaaS-architecture-primer.md
new file mode 100644
index 0000000000..dbe79bf7e7
--- /dev/null
+++ b/_posts/2023-03-03-SaaS-architecture-primer.md
@@ -0,0 +1,73 @@
+---
+layout: post
+title: "Architecture primer for upstart SaaS founders"
+author: p6
+categories: [ Platform Engineering, Data, SaaS, API Metering ]
+image: assets/blog-images/mainfesto.jpg
+featured: true
+hidden: true
+teaser: 5 areas to plan upfront & build right
+toc: true
+---
+
+
+# Architecture isn't dead
+
+Founders have a lot on their hands. Even seasoned engineering leaders struggle with the dilemma of choices when the primary pressure is the market opportunity and everything hinges on time to ship. Technical debt is pretty much the state of affairs. Architecture thinking might be frowned upon or looked at as unrealistic. It is true that "architecture upfront" is a red herring, particularly at an early stage, in a volatile setting.
+
+Nonetheless, we can't stress enough the importance of a north star vision and guard rails even in evolving architectures. Truly emergent architectures don't just emerge organically. Rather, the most successful ones are almost always the result of good principles chosen with careful deliberation.
+
+# Everything is a tenancy problem.
+
+The SaaS economy is fueled by multi-tenancy. The more, the merrier. However, this spawns its own set of issues. Even the most mature SaaS companies struggle with these problems:
+
+- Noisy Neighbours: likely to degrade performance for others
+- Freeloaders: doing more than what they should be, at their pricing tier
+- Under-utilizers: not getting enough bang for buck. Churn risk
+
+The reality is that "pay for only what you use" is notoriously difficult to implement in practice, as it puts a huge amount of pressure on what to meter, thus spawning notoriously difficult-to-understand (and unpredictable) billing models. No wonder many SaaS companies end up pricing their solutions with a tenant quota around a facade of some business value, while many others struggle and end up passing on infrastructure-tiered offerings.
+
+In our own working experience, we have seen even mature, hyperscale SaaS providers struggle to understand usage patterns and the fairness of their billing models. The other side of the same problem is a fin-ops one: cost attribution of compute and infrastructure resources, relative to the pricing subscribed.
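+
+To make the cost-attribution problem concrete before the principles that follow, here is a toy sketch relating usage measured in consumption units to the infrastructure cost attributed per tenant; every name and number in it is hypothetical.
+
+```python
+# Hypothetical figures: consumption units billed vs. internal cost attributed per tenant
+tenant_usage = {"acme": 120_000, "globex": 45_000}   # consumption units (e.g. API calls)
+attributed_cost = {"acme": 380.0, "globex": 210.0}   # internal cost units (e.g. USD of compute)
+unit_price = 0.005                                   # what the tenant pays per consumption unit
+
+for tenant, units in tenant_usage.items():
+    revenue = units * unit_price
+    cost = attributed_cost[tenant]
+    exchange_rate = cost / units         # internal cost per consumption unit sold
+    margin = (revenue - cost) / revenue  # a thin or negative margin flags freeloaders or mispriced tiers
+    print(f"{tenant}: cost/unit={exchange_rate:.5f}, margin={margin:.0%}")
+```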
+
+The least you must do is to start with the following fin-ops principles:
+
+- Price the solution to the customer around the most self-evident business metric (aka consumption units). This must make sense! Occam's Razor applies. Meter and measure these with the most granular labels possible
+- Consider tenancy and clustering assumptions carefully: be in a position to hold up customer usage (measured in consumption units) against an aggregate metric of your underlying cost (measured in internal cost units)
+- The above constitute an exchange rate that must provide a clearer picture of your economics
+
+# Don't reinvent the wheel - build on great platform primitives
+
+Tech choices are hard, our biases notwithstanding. Unless you're building specifically for a totally different class of problems, you're likely to be building on the public cloud. While cloud providers seek to provide differentiated offerings, there is a risk of lock-in. Don't get us wrong -- there are very many benefits in these offerings and the vertically integrated experience provided by the hyperscaler is very compelling; however, it does decrease your leverage in potentially migrating elsewhere when push comes to shove. This is also materially significant for those building a multi-cloud, hybrid-cloud offering in the first place.
+
+It makes eminent sense to balance such risks pragmatically.
+
+- Standardize on mature open-source
+- Adopt ubiquitous tools that have become platform interfaces by themselves
+- Adopt something that conforms to both of the above, with a managed service provided by the cloud (as opposed to a completely proprietary offering)
+
+# BYOC is critical in a multi-cloud hybrid-cloud landscape
+
+Based on the nature of your product, the markets you want to cater to and your tenancy model, you must consider
+
+- Whether your solution may ever need to be deployed outside your choice of cloud provider (including on-premise, e.g. for regulated domains)
+- Data sovereignty and legal considerations for the markets you cater to
+- If a cloud provider marketplace could itself be a GTM channel
+
+In all these models, it makes sense to think about tenanting the customer around clusters of availability, along with the freedom to choose the cloud provider. For sufficiently complex products, a control-plane/data-plane separation of concerns clearly emerges; and although the cloud provider choice for a control plane need not be within customer discretion, it would most times make sense for the control plane to be colocated in the same provider.
+
+Finally, hybrid cloud is a reality. If you need to support on-premise, the most unreasonably effective way is to standardize on Kubernetes as the underlying chassis, an architecture choice that resonates with all the other points we've made.
+
+# Think ecosystems
+
+The maker's mind is fundamentally creative. There is always an urge to capture the greatest part of the value chain. However, one must weigh the costs, effort and the RoI of building (and overreaching in the process).
+
+The SaaS offering is fundamentally centred around owning the critical path of some user experience around a business problem. A PaaS offering is centred around the developer experience of a specific technology. No matter what tier your offering sits at, there will always likely be areas which could be catered to by someone else with a (even if marginal) competitive advantage.
+
+SaaS companies must therefore be brutally honest about areas that are core, adjacent or peripheral to their business. A partner ecosystem can be very effective leverage for compounding the scale of your own business while improving integration and customer stickiness. It is crucial to think about:
+
+- Your ecosystem proposition: what is in it for your partners (particularly makers or developers) - typically converging to a revenue-sharing goal
+- API strategy as part of your product, focused around key integrations
+- Extension interfaces through plugins that run in sandboxed runtimes provided by you (consider WebAssembly)
+- Ecosystem fairness: ensure you don't compete for the market share that the partner network competes for
+
diff --git a/_posts/2023-03-17-Faust_101.md b/_posts/2023-03-17-Faust_101.md
new file mode 100644
index 0000000000..3c6a23448d
--- /dev/null
+++ b/_posts/2023-03-17-Faust_101.md
@@ -0,0 +1,398 @@
+---
+layout: post
+title: "Faust 101 - Stream processing"
+author: ashwin
+categories: [ Faust, Stream processing, Data, Kafka, Kafka Streams ]
+image: assets/blog-images/faust_101_blog/FaustLogo.png
+teaser: A quick beginner's walkthrough of Faust - a Python stream processing library
+toc: true
+---
+
+## Introduction
+
+Stream processing is a data processing technique to analyze, aggregate, filter or enhance continuously ingested data as it arrives. Stream processing is used synonymously with real-time data processing, as the input data is processed as it is ingested, with minimal latency. Some of its applications include fraud detection, IoT device data streaming, event streaming etc. Stream processing involves a publisher / source which sends data to a stream processing application where the data is processed, and the results are sent to a subscriber / sink.
+
+Kafka Streams is one of the best frameworks for distributed, real-time stream processing. Although it's a great tool, actually setting up a production-level stream processing pipeline is not an easy task, and there is a learning curve associated with designing effective Kafka Streams applications. The Kafka Streams API is written in Java and Scala, but there are Python implementations like Faust which can also be used.
+
+## What is Faust?
+
+Faust is a stream processing library written in Python which ports the ideas of Kafka Streams to Python. It is used at [Robinhood](http://robinhood.com/) to build high performance distributed systems and real-time data pipelines that process billions of events every day. The original [repo](https://github.com/robinhood/faust), which has around 6.5k stars, is currently deprecated, but it has been [forked](https://github.com/faust-streaming/faust) and is actively maintained by the community.
+
+
+For reliability, Faust uses a Kafka topic as a "write-ahead log". Popular libraries like Pandas, NumPy, SQLAlchemy etc. can be used in conjunction with Faust. It uses a fast embedded database (written in C++) called RocksDB to store state during stream processing.
+
+Following are the concepts used by Faust, illustrated in the sketch after this list:
+
+- App - an instance of the Faust application.
+- Agent - an asynchronous, distributed processor to process continuous data.
+- Table - sharded dictionaries to enable stateful stream processing.
+- Channel - a buffer/queue used to send and receive messages.
+- Sensors - monitor and record events in the Faust application.
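+
+As a minimal illustration of how these pieces fit together (the broker address, topic name and word-count logic below are made up for the example), a Faust app wires an agent to a topic and keeps its state in a table:
+
+```python
+import faust
+
+# Hypothetical minimal app: count words arriving on a topic, keeping counts in a table
+app = faust.App("hello-faust", broker="kafka://localhost:9092")
+
+words_topic = app.topic("words", value_type=str)     # stream of incoming messages
+word_counts = app.Table("word_counts", default=int)  # sharded, changelog-backed state
+
+@app.agent(words_topic)
+async def count_words(words):
+    # The agent processes the stream asynchronously, one event at a time
+    async for word in words:
+        word_counts[word] += 1
+
+if __name__ == "__main__":
+    app.main()
+```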
+ +Faust stores internal data like application configurations, table data etc. in Kafka topics to facilitate distributed, fault-tolerant and durable data processing across machines. In general, Faust inputs data from topics, process it and write output to topic again. Faust supports all the operations like group by, map, filter, joins etc. which are supported by Kafka Streams. + +## Faust vs Kafka Streams + +Some of the key differences between Faust and Kafka Streams are as follows: + +- Faust is written in Python, while Kafka Streams is written in Java. +- Python has better support for data engineering and machine learning libraries like Pandas, Scikit-learn etc. which can be effectively utilized by Faust. +- Kafka Streams uses a Domain Specific Language(DSL) whereas Faust is just Python. +- Faust provides a simpler, higher-level API that is easier to use than Kafka Streams' lower-level API. If you are new to stream processing or prefer a simpler API, Faust may be a better choice. +- The concept of “agents” comes from the actor model, and means the stream processor can execute concurrently on many CPU cores, and on hundreds of machines at the same time. +- Thanks to Faust and asyncio you can now embed your stream processing topology into your existing asyncio/ eventlet/Twisted/Tornado applications. No need for complex topologies as it can be simplified with just python. + +Both Kafka Streams and Faust are tightly integrated with Kafka and can leverage Kafka's features such as availability, concurrency and fault tolerance. + +In essence, Faust is a good starting point for anyone who wants to learn how Kafka stream processing systems work. It is a better option for data analytics or data science use cases. + +## Prerequisites + +Our objective here is to write a simple Kafka streams application in python using the Faust package. We will leverage the Confluent cloud infrastructure to create some topics. Confluent provides a fully managed cloud service for Apache Kafka which is super easy to deploy and use. You can sign up for Confluent cloud [here](https://www.confluent.io/confluent-cloud/tryfree/) and earn free credits worth $400 for the first 60 days. + +For this tutorial purposes, we will be using an open-source time-series based [power consumption dataset](https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set) which is available in Kaggle. Measurements in the dataset are of electric power consumption in one household with a one-minute sampling rate over a period of almost 4 years. We will look at only the voltage parameter and find its average for every 1-hour windows. These windowed averages can tell us if there is a power spike which needs to be looked at. + +## Step 1: Create a topic in Confluent Cloud + +Once you have signed up for a confluent cloud account, you can create a basic cluster with default configuration at the region of your liking. + +![Image-1](../assets/blog-images/faust_101_blog/faust_blog_1.png) + +Navigate to topics and create a topic with default configurations called `voltage_input`. + +## Step 2: Downloading and installing requirements + +Let’s create a python client to produce data to the topic. Navigate to ‘Clients’ under the ‘Data integration’ option on the cluster dashboard and create a new python client. + +![Image-2](../assets/blog-images/faust_101_blog/faust_blog_2.png) + +Generate api key and copy the configuration snippet. 
Save it in the following path,
+
+```bash
+$HOME/.confluent/python.config
+```
+
+We will use the `confluent-kafka` python package to produce data to the input topic from our local machine. Create a virtual environment with the following packages,
+
+```plain-text
+requests
+certifi
+confluent-kafka[avro,json,protobuf]>=1.4.2
+faust[fast]
+```
+
+Let's download the python file which will read the config file and parse it for the application to use.
+
+```bash
+wget https://raw.githubusercontent.com/confluentinc/examples/6.1.1-post/clients/cloud/python/ccloud_lib.py
+```
+
+Download the dataset file `household_power_consumption.txt` from the Kaggle [link](https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set) and save it in our working directory.
+
+## Step 3: Produce to the input topic
+
+We will use the `ccloud_lib` python module to parse the config file and also the input arguments for the producer module.
+
+`producer.py`
+
+```python
+#!/usr/bin/env python
+
+import json
+from datetime import datetime
+
+import ccloud_lib
+from confluent_kafka import SerializingProducer
+from confluent_kafka.serialization import StringSerializer
+
+delivered_records = 0
+
+# Callback function when a message has been successfully delivered
+def acked(err, msg):
+    """Delivery report handler called on
+    successful or failed delivery of message
+    """
+    global delivered_records
+    if err is not None:
+        print("Failed to deliver message: {}".format(err))
+    else:
+        delivered_records += 1
+        print("Produced record to topic {} partition [{}] @ offset {}"
+              .format(msg.topic(), msg.partition(), msg.offset()))
+
+# Parse one line of the dataset into a message; malformed voltage readings default to 0.0
+def fetch_time_series_data(count=1):
+    data_split = data[count].split(";")
+    try:
+        new_message = {"datetime": datetime.strptime(data_split[0] + " " + data_split[1], "%d/%m/%Y %H:%M:%S"), "voltage": float(data_split[4])}
+    except ValueError:
+        new_message = {"datetime": datetime.strptime(data_split[0] + " " + data_split[1], "%d/%m/%Y %H:%M:%S"), "voltage": 0.0}
+    return count + 1, new_message
+
+if __name__ == '__main__':
+
+    # Read arguments and configurations and initialize
+    args = ccloud_lib.parse_args()
+    config_file = args.config_file
+    topic = args.topic
+    conf = ccloud_lib.read_ccloud_config(config_file)
+
+    # Create topic if needed
+    # ccloud_lib.create_topic(conf, topic)
+
+    # Initialize the producer configuration
+    producer_conf = ccloud_lib.pop_schema_registry_params_from_config(conf)
+    producer_conf['key.serializer'] = StringSerializer('utf-8')
+    producer_conf['value.serializer'] = StringSerializer('utf-8')
+    p = SerializingProducer(producer_conf)
+
+    with open("./household_power_consumption.txt", "r") as f:
+        data = f.readlines()
+
+    no_of_records = 10000
+    start_index = 1  # skip the header row of the dataset
+    while no_of_records:
+        # Call poll to send any queued record
+        p.poll(0)
+        start_index, base_message = fetch_time_series_data(start_index)
+        no_of_records -= 1
+
+        value = base_message
+
+        record_key = "power"
+        # default=str serializes the datetime field as a string
+        record_value = json.dumps(value, default=str)
+
+        p.produce(topic, key=record_key, value=record_value, on_delivery=acked)
+
+    # Call flush to send any queued record at the end
+    p.flush()
+```
+
+We are producing the voltage data with a datetime column to the input topic.
+
+`poll()` or `flush()` needs to be called to send the queued data to the Kafka broker.
+
+## Step 4: Creating a Faust app
+
+Let's create a simple Faust app which will read from the `voltage_input` topic and do a windowed aggregation of the voltage values.
+
+First, we need to define the input and output data models for the Faust app.
+ +```python +# Input model +class PowerModel(faust.Record, coerce=True, date_parser=parse_date): + datetime: datetime + voltage: float + +#Output model +class AverageVoltage(faust.Record): + start_time: datetime + count: int + mean: float +``` + +Let’s initialize the faust app with the kafka broker connection details. We need to pass in the SASL credentials which were created previously and stored in the config file. + +```python +ssl_context = ssl.create_default_context() +ssl_context.load_verify_locations(cafile=certifi.where()) + +app = faust.App('myapp', broker='kafka://BROKER_HOST_NAME:9092', broker_credentials=faust.SASLCredentials(username=SASL_USERNAME, password=SASL_PASSWORD, ssl_context=ssl_context), topic_replication_factor=3, topic_partitions=3) +``` + +Define the input topic stream and output sink topic for the aggregated average voltages. + +```python +voltage_stream = app.topic('voltage_input', key_type=str, value_type=PowerModel) +sink = app.topic('voltage_avg', value_type=AverageVoltage) +``` + +Next, we need to define the aggregation logic for calculating the hourly average voltage. + +```python +def hourly_window_processor(key, events): + timestamp = key[1][0] + timestamp = datetime.fromtimestamp(timestamp) + values = [event.voltage for event in events] + count = len(values) + mean = sum(values) / count + + logger.info( + f'processing window:' + f'{len(values)} events,' + f'mean: {mean:.2f},' + f'timestamp {timestamp}', + ) + + sink.send_soon(key='avg_voltage_hourly', value=AverageVoltage(start_time=timestamp, count=count, mean=mean)) + +hourly_window_voltage = app.Table('hourly_average_voltage', default=list, partitions=3, on_window_close=hourly_window_processor)\ + .tumbling(timedelta(hours=1), expires=30)\ + .relative_to_field(PowerModel.datetime) +``` + +We are using a tumbling window here as we need to calculate the average every hour. Values are stored in the transition table `hourly_average_voltage` as it comes. Once the time window closes, the callback function calculates the average for the values in that window and sinks it to the output topic. + +Now, we need to define an app agent to append the input data stream to the hourly window voltage table. 
+
+```python
+@app.agent(voltage_stream)
+async def processor(records):
+    async for record in records:
+        value_list_hour = hourly_window_voltage['events'].value()
+        value_list_hour.append(record)
+        hourly_window_voltage['events'] = value_list_hour
+```
+
+Following is the final, aggregated code of the Faust app. Note that the broker host and SASL credentials below are placeholders; substitute the values from your own cluster configuration rather than hard-coding real secrets in the source.
+
+`app.py`
+
+```python
+import faust
+from datetime import datetime, timedelta
+from dateutil.parser import parse as parse_date
+import ssl
+import certifi
+import logging
+
+logger = logging.getLogger(__name__)
+
+ssl_context = ssl.create_default_context()
+ssl_context.load_verify_locations(cafile=certifi.where())
+
+class AverageVoltage(faust.Record):
+    start_time: datetime
+    count: int
+    mean: float
+
+class PowerModel(faust.Record, coerce=True, date_parser=parse_date):
+    datetime: datetime
+    voltage: float
+
+app = faust.App('myapp', broker='kafka://BROKER_HOST_NAME:9092', broker_credentials=faust.SASLCredentials(username=SASL_USERNAME, password=SASL_PASSWORD, ssl_context=ssl_context), topic_replication_factor=3, topic_partitions=3)
+app.conf.table_cleanup_interval = 1.0
+
+voltage_stream = app.topic('voltage_input', key_type=str, value_type=PowerModel)
+sink = app.topic('voltage_avg', value_type=AverageVoltage)
+
+def hourly_window_processor(key, events):
+    timestamp = key[1][0]
+    timestamp = datetime.fromtimestamp(timestamp)
+    values = [event.voltage for event in events]
+    count = len(values)
+    mean = sum(values) / count
+
+    logger.info(
+        f'processing window:'
+        f'{len(values)} events,'
+        f'mean: {mean:.2f},'
+        f'timestamp {timestamp}',
+    )
+
+    sink.send_soon(key='avg_voltage_hourly', value=AverageVoltage(start_time=timestamp, count=count, mean=mean))
+
+hourly_window_voltage = app.Table('hourly_average_voltage', default=list, partitions=3, on_window_close=hourly_window_processor)\
+    .tumbling(timedelta(hours=1), expires=30)\
+    .relative_to_field(PowerModel.datetime)
+
+@app.agent(voltage_stream)
+async def processor(records):
+    async for record in records:
+        value_list_hour = hourly_window_voltage['events'].value()
+        value_list_hour.append(record)
+        hourly_window_voltage['events'] = value_list_hour
+
+if __name__ == '__main__':
+    app.main()
+```
+
+## Step 5: Running the app and viewing the results
+
+We can start the Faust worker in a terminal by executing the following command,
+
+```bash
+python app.py worker -l info
+```
+
+Let's open another terminal and start producing records to the `voltage_input` topic by executing the following command.
+
+```bash
+./producer.py -f ~/.confluent/python.config -t voltage_input
+```
+
+Let's write a simple `consumer.py` code to consume the aggregated results from our output topic `voltage_avg`.
+ +`consumer.py` + +```python +from confluent_kafka import Consumer +import json +import ccloud_lib +import sys +import datetime + +if __name__ == '__main__': + + # Read arguments and configurations and initialize + args = ccloud_lib.parse_args() + config_file = args.config_file + topic = args.topic + conf = ccloud_lib.read_ccloud_config(config_file) + + # Create Consumer instance with a unique group id + # 'auto.offset.reset=earliest' to start reading from the beginning of the + # topic if no committed offsets exist + consumer_conf = ccloud_lib.pop_schema_registry_params_from_config(conf) + consumer_conf['group.id'] = 'python_example_group_1' + consumer_conf['auto.offset.reset'] = 'earliest' + consumer = Consumer(consumer_conf) + + # Subscribe to topic + consumer.subscribe([topic]) + + # Process messages + total_count = 0 + try: + while True: + msg = consumer.poll(1.0) + if msg is None: + print("Waiting for message or event/error in poll()") + continue + elif msg.error(): + print('error: {}'.format(msg.error())) + else: + # Check for Kafka message + record_key = msg.key() + record_value = msg.value() + data = json.loads(record_value) + total_count += 1 + print(data) + except KeyboardInterrupt: + pass + finally: + # Leave group and commit final offsets + consumer.close() +``` + +Run the following command to start the consumer in terminal, + +```bash +./consumer.py -f ~/.confluent/python.config -t voltage_avg +``` + +Now, you can view the magic by opening three terminals and running the producer, Faust app worker and consumer simultaneously in them. + +## Conclusion + +In this tutorial, we were able to accomplish the following, + +1. Create a simple Faust application to calculate hourly windowed average of voltages from our input dataset. +2. Create a simple python producer and consumer to interact with a Confluent cloud topic. + +Faust is great way to leverage the capabilities of Kafka Streams in conjunction with popular python libraries like Pandas, NumPy, Scikit-learn etc. to write distributed, fault-tolerant streaming applications. Faust is simple and can be used to build real-time applications involving machine learning, deep learning etc. + +Some of the other python coded stream processing library include, +- https://github.com/wintoncode/winton-kafka-streams +- https://github.com/bytewax/bytewax +- https://github.com/rh-marketingops/fluvii + +## References + +1. [https://faust.readthedocs.io/en/latest/userguide/tables.html](https://faust.readthedocs.io/en/latest/userguide/tables.html) +2. [https://github.com/robinhood/faust/blob/master/examples/windowed_aggregation.py](https://github.com/robinhood/faust/blob/master/examples/windowed_aggregation.py) +3. 
[https://github.com/confluentinc/examples/tree/7.1.0-post/clients/cloud/python](https://github.com/confluentinc/examples/tree/7.1.0-post/clients/cloud/python)
diff --git a/_posts/2023-03-24-Kafka_batch_processing_with_Airflow.md b/_posts/2023-03-24-Kafka_batch_processing_with_Airflow.md
new file mode 100644
index 0000000000..b297c8be5c
--- /dev/null
+++ b/_posts/2023-03-24-Kafka_batch_processing_with_Airflow.md
@@ -0,0 +1,460 @@
+---
+layout: post
+title: "Kafka batch processing with Airflow"
+author: ashwin
+categories: [ Airflow, Batch processing, Data, Kafka, Kafka Consumer, Docker]
+image: assets/blog-images/batch_processing_blog/KafkaBatchProcessing.png
+teaser: Batch processing from Kafka topics using an Apache Airflow DAG
+toc: true
+---
+
+## Introduction
+
+In the world of big data, managing large datasets is crucial. Batch processing is the process of running a large set of data through a particular program or pipeline at one time. In this blog post, we will discuss how Kafka and Airflow can be used for batch processing.
+
+Kafka is a distributed streaming platform which uses logs as the unit of storage for messages passed within the system. Kafka has a unique architecture that allows it to handle high-volume data streams in real time. Event streaming enables real-time data interpretation and processing to effect near real-time decisions that improve efficiency and optimize costs in businesses. Kafka offers a distributed, highly scalable, elastic, fault-tolerant, and secure infrastructure for real-time event streams. Kafka is used in a wide variety of use cases like processing real-time transactions in banks, real-time analysis of sensor data from IoT devices etc.
+
+Airflow is an open source orchestration tool to manage big data workflows. Workflows are programmatically authored as Directed Acyclic Graphs (DAGs) of tasks. The Airflow scheduler executes your tasks on an array of workers while following the specified dependencies. Airflow pipelines are defined as Python code, which means that anything that you can do with Python you can do with Airflow. It also provides a variety of operators and hooks which help us integrate seamlessly with most cloud and open-source services. Airflow is designed as a configuration-as-code system and it can be heavily customized with plugins.
+
+## Why Airflow in Batch processing pipeline?
+
+Although Kafka is mostly used for real-time stream processing, there are a lot of instances where batch processing is required. For example,
+
+- Online stores might be interested in finding out the hourly or daily aggregate of the number of unique users arriving through a particular advertising channel to optimise their ad spend.
+- Banks settling the accumulated bills to vendors at the end of the day based on daily transactions.
+
+![Image-1](../assets/blog-images/batch_processing_blog/airflow_blog_1.png)
+
+Airflow is an ideal tool to orchestrate and manage batch processing of logs from Kafka topics as it runs on schedules, so the resource usage is significantly lower when compared to Kafka Streams or any streaming application. Airflow allows us to decouple big tasks into smaller ones and manage the dependencies between them using DAGs. It also allows us to monitor the status of tasks, and re-run or restart tasks from any given point in a workflow in case of failure. Airflow also lets you integrate with tools like Apache Spark etc. to execute the task.
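+
+Before building the real pipeline below, here is a minimal, hypothetical TaskFlow-style DAG just to show what "pipelines as Python code" looks like; the task bodies are stand-ins for the Kafka consume and aggregate steps developed later in this post.
+
+```python
+import pendulum
+from airflow.decorators import dag, task
+
+@dag(schedule_interval="@daily", start_date=pendulum.datetime(2023, 1, 1), catchup=False)
+def minimal_batch_pipeline():
+    @task
+    def extract():
+        return [1, 2, 3]                     # stand-in for "consume a batch from Kafka"
+
+    @task
+    def aggregate(values):
+        print(f"daily sum = {sum(values)}")  # stand-in for the real processing step
+
+    aggregate(extract())                     # dependency: extract -> aggregate
+
+dag = minimal_batch_pipeline()
+```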
+ +## Kafka consumer partition assignment strategy + +In general, Kafka consumers are identified by a consumer group id i.e. all the consumer having the same group id will share the subscribed `TopicPartitions` among them. When a consumer is added or removed from the group, a consumer re-balance is triggered which completely reassigns `TopicPartitions` among the new set of consumers. There is no guarantee that the consumer-partition mapping will remain the same before and after re-balance. To read about more on the different partition assignment strategies, please visit this [link](https://www.conduktor.io/blog/kafka-partition-assignment-strategy/). + +To control the consumer-partition mapping, we need to manually assign a `TopicPartition` to a consumer in the group. This can be achieved by using `consumer.assign()` command. Manual partition assignment can be useful in cases where different processing logic are applied for different partitions, assigning dedicated consumers to heavy throughput partitions etc. In this example, we will use `consumer.assign` to consume from a specific partition, process the data and produce the values to the same output topic partition as input topic. + +## Airflow Dynamic Task Mapping + +Dynamic task mapping enables us to create parallel tasks in DAG at run time. It facilitates running parallel tasks equal to the number of input Kafka topic partitions. In Kafka, maximum parallelisation is defined by the number of topic partitions. The expand() command is used to dynamically map tasks in Airflow. For more information, visit this [link](https://airflow.apache.org/docs/apache-airflow/2.3.0/concepts/dynamic-task-mapping.html). + +![Image-2](../assets/blog-images/batch_processing_blog/airflow_blog_2.png) + +We will use manual partition assignment with consumers rather than group based partition assignment. This is to ensure that data can be consumed from an input topic partition, processed and produced to an output topic partition same as the input topic partition in parallel tasks. For the above logic to hold true, the input and output topic should be created with the same number of partitions. Consumer re-balancing in group based partition assignment can lead to mix up while processing data in parallel tasks. + +## Step 1: Download and setup Airflow + +For this tutorial purposes, we will run airflow in docker. For Production, Airflow supports helm charts for Kubernetes deployment. We will be following this [link](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html) to set up Airflow in docker. + +1. Open a terminal window and navigate to the working directory. + + ```bash + cd airflow_kafka + ``` + +2. Fetch the `docker-compose.yml` file, + + ```bash + curl -LfO   'https://airflow.apache.org/docs/apache-airflow/2.3.2/docker-compose.yaml' + ``` + +3. Create the required folders for docker compose and set the airflow uid (for Linux), + + ```bash + mkdir -p ./dags ./logs ./plugins ./results + echo -e "AIRFLOW_UID=$(id -u)" > .env + ``` + + +We have created an extra folder called results to store topic offset locally. + +Next, we need Kafka python libraries which are not available in the default airflow image. So, let’s build our custom image with the required dependencies, + +1. Create a `requirements.txt` file with the required packages, + + ``` + confluent-kafka + ``` + +2. 
Create a `Dockerfile` to build custom image, + + ```docker + FROM apache/airflow:2.3.0 + USER root + RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + vim \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + USER airflow + COPY requirements.txt . + RUN pip install --no-cache-dir -r requirements.txt + ``` + +3. In `docker-compose.yml`, comment the image option and comment out the build option, + + ```yaml + version: '3' + x-airflow-common: + &airflow-common + # In order to add custom dependencies or upgrade provider packages you can use your extended image. + # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml + # and uncomment the "build" line below, Then run `docker-compose build` to build the images. + # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.3.0} + build: . + environment: + ...... + ``` + + +1. Also, add results to the docker container volumes, + + ```yaml + volumes: + - ./dags:/opt/airflow/dags + - ./logs:/opt/airflow/logs + - ./plugins:/opt/airflow/plugins + - ./results:/opt/airflow/results + ``` + +2. Run database migrations and create the first user account, + + ```bash + docker-compose up airflow-init + ``` + +3. Now the Airflow is set up and can be started by executing, + + ```bash + docker-compose up -d + ``` + + +Airflow web UI can be accessed at [http://localhost:8080/](http://localhost:8080/) + +## Step 2: Setup the Kafka Server config file + +Before we start writing dags, let’s create the config file to access the input topic in a Kafka cluster. This tutorial assumes you have basic Kafka knowledge and already have an input Kafka topic set up in a remote or local Kafka cluster. Here, we will use an existing topic in the Confluent Cloud as the input topic. The client config file to connect to Kafka cluster will look like the following, + +`kafka_server.config` + +``` +bootstrap.servers=: +security.protocol=SASL_SSL +sasl.mechanisms=PLAIN +sasl.username= +sasl.password= +``` + +In addition to the above, we can also define producer and consumer specific properties. It would be ideal to have separate config files for producer and consumer. Place the `kafka_server.config` file inside the plugins folder to make it available inside the container. + +The input topic here contains the following as message value, + +```json +{  + "datetime": "23/12/2006 16:52:00", + "voltage": 239.94 +} +``` + +## Step 3: Define DAG parameters + +Let’s create the `dag.py` inside the `/dags` folder. Airflow automatically reads dags defined in the `/dags` folder. + +1. Import the required modules, + + ```python + from datetime import datetime, timedelta + import os + import pendulum + import pandas as pd + import csv + import json + from confluent_kafka import Consumer, TopicPartition, Producer + from confluent_kafka.serialization import StringSerializer + from confluent_kafka.admin import AdminClient + from airflow.decorators import dag, task + ``` + +2. Function to read and parse the config file, + + ```python + def read_ccloud_config(config_file): + """Read Confluent Cloud configuration for librdkafka clients""" + + conf = {} + with open(config_file) as fh: + for line in fh: + line = line.strip() + if len(line) != 0 and line[0] != "#": + parameter, value = line.strip().split('=', 1) + conf[parameter] = value.strip() + + return conf + ``` + +3. 
A producer callback function to print the delivery status,
+
+    ```python
+    delivered_records = 0
+
+    # Callback function when a message has been successfully delivered
+    def acked(err, msg):
+        """Delivery report handler called on
+        successful or failed delivery of message
+        """
+        global delivered_records
+        if err is not None:
+            print("Failed to deliver message: {}".format(err))
+        else:
+            delivered_records += 1
+            print("Produced record to topic {} partition [{}] @ offset {}"
+                  .format(msg.topic(), msg.partition(), msg.offset()))
+    ```
+
+4. Define the global variables to be used in the dag tasks and the dag configuration. The input values are hard-coded in this example; they can also be passed as input parameters while triggering the dag.
+
+    ```python
+    # Input Values
+    topic_config_file = "/opt/airflow/plugins/kafka_server.config"
+    topic_name = "airflow-input-topic"
+    output_topic_name = "airflow-output-topic"
+    # Local file to save latest offsets for topic partitions
+    local_offset_file = "/opt/airflow/results/topic-offsets-latest.csv"
+    consumer_conf = read_ccloud_config(topic_config_file)
+    producer_conf = read_ccloud_config(topic_config_file)
+    # Create an empty csv file with headers if not available
+    if not os.path.exists(local_offset_file):
+        with open(local_offset_file, "w+") as file:
+            csvwriter = csv.writer(file)
+            csvwriter.writerow(["id", "topic", "partition", "last_offset"])
+
+    # Define the DAG configuration for the DAG() function with the @dag decorator,
+    @dag(
+        dag_id="{}_batch_processor".format(topic_name),
+        schedule_interval='@daily',
+        start_date=pendulum.datetime(2022, 5, 1),
+        catchup=False,
+        dagrun_timeout=timedelta(minutes=60),
+    )
+    def DAG():
+    ```
+
+
+The `catchup=False` setting prevents the DAG from running missed scheduled runs.
+
+## Step 4: Fetch partition ids for the given topic
+
+Let's start building the DAG by defining each sequential task. All the tasks should be defined inside the `DAG()` function with the `@task` decorator.
+
+The initial task is to fetch the partition ids for the given topic,
+
+```python
+def DAG():
+    @task
+    def get_topic_partitions():
+        # Reuse the connection properties parsed from the config file
+        kadmin = AdminClient(consumer_conf)
+        partitions = kadmin.list_topics(topic=topic_name).topics[topic_name].partitions
+        partition_ids = []
+        for partition in partitions:
+            partition_ids.append(partition)
+        # print(partition_ids)
+        return partition_ids
+```
+
+The Kafka Admin Client is used to fetch the topic partitions.
+
+## Step 5: Consume new data for each partition
+
+The next task is to consume messages for each topic partition, from the previously committed offset up to the latest offset.
+ +```python +@task +def consume_from_topic_partition(partition_id): + consume_skip = False + # Transient file to store the consumed messages + filename = "/opt/airflow/results/raw/%s_%s.csv" % (topic_name, partition_id) + if not os.path.exists(filename): + with open(filename, 'a+') as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["key", "datetime", "voltage", "offset"]) + # Setup consumer related config + consumer_conf["auto.offset.reset"] = "earliest" + consumer_conf["group.id"] = "airflow-group" + consumer = Consumer(consumer_conf) + + # Get the earliest and latest offset for a topic partition + low, high = consumer.get_watermark_offsets(TopicPartition(topic_name, partition_id)) + + high_offset = high - 1 + + print(high_offset) + + # Load the local offset file as dataframe + local_offset_db = pd.read_csv(local_offset_file, index_col=[0]) + + last_offset_row = local_offset_db.loc[(local_offset_db["topic"] == topic_name) & (local_offset_db["partition"] == partition_id), ["last_offset"]] + + if len(last_offset_row) == 0: + # If the row for the topic partition does not exist, create one. + with open(local_offset_file, "a+") as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["%s_%s" % (topic_name, partition_id) , topic_name, partition_id, 0]) + consumer.assign([TopicPartition(topic_name, partition_id, low)]) + else: + last_offset_value = last_offset_row["last_offset"].values[0] + # Skip consumer if last committed offset is the same as latest offset as now new messages to consume + if last_offset_value == high_offset: + consume_skip = True + consumer.assign([TopicPartition(topic_name, partition_id, last_offset_value + 1)]) + + if consume_skip: + print("Already consumed till latest offset") + consumer.close() + else: + total_count = 0 + try: + while True: + msg = consumer.poll(1.0) + if msg is None: + # No message available within timeout. + # Initial message consumption may take up to + # `session.timeout.ms` for the consumer group to + # rebalance and start consuming + print("Waiting for message or event/error in poll()") + continue + elif msg.error(): + print('error: {}'.format(msg.error())) + else: + total_count += 1 + value = json.loads(msg.value()) + value["datetime"] = datetime.strptime(value["datetime"], "%d/%m/%Y %H:%M:%S") + value["voltage"] = float(value["voltage"]) + with open(filename, 'a') as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow([msg.key(), value["datetime"], value["voltage"], msg.offset()]) + # Break loop if message offset is the latest offset + if msg.offset() == high_offset: + break + except KeyboardInterrupt: + pass + finally: + print(total_count) + # Leave group and commit final offsets + consumer.close() + return partition_id, high_offset +``` + +## Step 6: Commit latest offset for each topic partition locally + +Next task is to commit the latest consumed offset to the local offset file. Partition Id and latest offset are taken as input from the previous task. Partition ids are passed to the next task. 
+ +```python +@task +def commit_offsets_locally(partition_offset_values): + partition_ids = [] + local_offset_db = pd.read_csv(local_offset_file, index_col=[0]) + for value in partition_offset_values: + print(value) + partition_ids.append(value[0]) + local_offset_db.loc[(local_offset_db["topic"] == topic_name) & (local_offset_db["partition"] == value[0]), "last_offset"] = value[1] + local_offset_db = local_offset_db.sort_index() + local_offset_db.to_csv(local_offset_file) + return partition_ids +``` + +## Step 7: Aggregate values for each partition + +Next task is to aggregate the values for the consumed messages. We will calculate the average voltage for each unique date and store them in a transient processed-csv file. + +```python +@task +def process_topic_messages(partition_id): + messages_db = pd.read_csv("/opt/airflow/results/raw/%s_%s.csv" % (topic_name, partition_id), parse_dates=["datetime"]) + daily_voltage_avg = messages_db[["datetime", "voltage"]].set_index("datetime").groupby(pd.Grouper(freq='D')).mean().reset_index() + daily_voltage_avg.columns = ["date", "avg_voltage"] + # Store in a transient processed file + daily_voltage_avg.to_csv("/opt/airflow/results/processed/%s_%s.csv" % (topic_name, partition_id)) + return partition_id +``` + +## Step 8: Produce aggregated value to output topic + +Next task is to produce the aggregated values to the output topic. Date wise average voltage values are produced to the same partition as the input topic. So, the input and output topics are co-partitioned. + +```python +@task +def produce_processed_data_to_output_topic(partition_id): + # Add producer related configuration + producer_conf['key.serializer'] = StringSerializer('utf-8') + producer_conf['value.serializer'] = StringSerializer('utf-8') + p = Producer(producer_conf) + with open("/opt/airflow/results/processed/%s_%s.csv" % (topic_name, partition_id), 'r') as file: + reader = csv.reader(file) + for row in reader: + if 'date' in row: + continue + p.poll(0) + record_key = row[1] + record_value = json.dumps({"date": str(row[1]), "voltage": row[2]}) + p.produce(output_topic_name, key=record_key, value=record_value, partition=partition_id, on_delivery=acked) + p.flush() + return "Finished producing" +``` + +## Step 9: Define the task flow for the DAG + +Next step is to connect the defined tasks in the appropriate order for the dag to execute. The `expand()` command is used to execute tasks in parallel and simultaneously. + +```python +partitions_latest_offset_list = consume_from_topic_partition.expand(partition_id=get_topic_partitions()) +partitions_list = commit_offsets_locally(partitions_latest_offset_list) +partitions_list = process_topic_messages.expand(partition_id=partitions_list) +produce_processed_data_to_output_topic.expand(partition_id=partitions_list) +``` + +Finally, call the `DAG()` function to execute the dag. + +```python +dag = DAG() +``` + +Executing the dag does not trigger the dag. A dag is triggered based on the defined schedule and can also be triggered manually in web UI and rest API endpoint. + +Now, run the `docker-compose up -d`  command and head to the link [http://localhost:8080](http://localhost:8080/)/. + +Default username and password for the airflow web UI are, + +``` +Username: airflow +Password: airflow +``` + +Once you log in, you should be able to see the defined DAG in the web UI which should have the same name as the `dag_id` we defined in the DAG configuration. 
+ +![Image-3](../assets/blog-images/batch_processing_blog/airflow_blog_3.png) + +We can now trigger the **`airflow-input-topic_batch_processor`** by clicking the play button under the `Actions` section. + +We can track the dag status in Graph or Grid mode in the dag page. Also, we can see the Logs, XCom etc. for each dag task like below. + +![Image-4](../assets/blog-images/batch_processing_blog/airflow_blog_4.png) + +## Conclusion + +In this tutorial, we were able to do the following, + +1. Build custom Airflow docker image with required python dependencies. +2. Run Airflow in docker compose with a custom docker image. +3. Dynamic task mapping in Airflow to run parallel tasks. +4. Produce to and consume from a Kafka topic per partition. +5. Input and Output Kafka topics are co-partitioned. + +In conclusion, Kafka and Airflow are two of the most popular open-source tools that are used for batch processing. Kafka is designed to handle high-volume data streams in real-time, while Airflow is designed to orchestrate complex workflows and data processing pipelines. Using Kafka and Airflow together can provide a scalable and fault-tolerant solution for batch processing large datasets. + +## References + +1. [https://airflow.apache.org/docs/apache-airflow/2.3.0/start/docker.html](https://airflow.apache.org/docs/apache-airflow/2.3.0/start/docker.html) +2. [https://airflow.apache.org/docs/apache-airflow/2.3.0/concepts/dynamic-task-mapping.html](https://airflow.apache.org/docs/apache-airflow/2.3.0/concepts/dynamic-task-mapping.html) +3. [https://airflow.apache.org/docs/docker-stack/build.html](https://airflow.apache.org/docs/docker-stack/build.html) +4. [https://github.com/confluentinc/examples/tree/7.1.0-post/clients/cloud/python](https://github.com/confluentinc/examples/tree/7.1.0-post/clients/cloud/python) diff --git a/_posts/2023-03-30-Command_Processing_and_Event_Driven_Workflows_with_Apache_Kafka_and_Airflow.md b/_posts/2023-03-30-Command_Processing_and_Event_Driven_Workflows_with_Apache_Kafka_and_Airflow.md new file mode 100644 index 0000000000..23c9b0aa58 --- /dev/null +++ b/_posts/2023-03-30-Command_Processing_and_Event_Driven_Workflows_with_Apache_Kafka_and_Airflow.md @@ -0,0 +1,451 @@ +--- +layout: post +title: "Command Processing and Event Driven Workflows with Apache Kafka and Airflow" +author: ashwin +categories: [ Airflow, Command processing, Event Driven Workflows, Kafka, Kafka Consumer, Docker] +image: assets/blog-images/command_processing_blog/EventDrivenArchitecture.png +featured: false +hidden: false +teaser: Reactive Command processing in real time using Apache Kafka and by utilising Workflow architecture in Apache Airflow +toc: true +--- + +## Introduction + +In today's world, data processing has become an essential part of businesses. With the rise of big data, companies are looking for efficient ways to process data quickly and accurately. Apache Kafka and Apache Airflow are two popular technologies used for this purpose. + +Apache Kafka is a distributed messaging system designed to handle large amounts of data efficiently. It can be used as a real-time stream processing platform, and it is widely used for its ability to handle high throughput and low latency. Kafka is a popular choice for building event-driven architectures because of its persistence and scalability. + +Apache Airflow, on the other hand, is an open-source platform used for managing and scheduling complex data pipelines. 
Airflow allows users to define and execute workflows, making it easier to manage and monitor data pipelines. Airflow boasts a wide range of operators for various types of operations and integrations with third-party applications.
+
+## Command Processing and Event Driven Workflows
+
+Command processing refers to a software design pattern in which an application receives and executes commands from users or other systems. In general, the application receives a command and then executes it in a synchronous manner, meaning that it waits for the command to complete before executing the next one. Command processing is simpler to implement and provides greater control over the order in which tasks are executed, making it well-suited for applications where the user is in direct control of the system.
+
+On the other hand, event-driven workflows are based on the idea of reacting to events that occur within a system, rather than waiting for a command to be executed. This pattern is often used in complex systems, such as distributed systems, to ensure that actions are taken in response to specific events, without the need for manual intervention. Event-driven workflows are more flexible and can adapt to changing conditions in real time, making them ideal for systems that require a high degree of automation or respond to external events.
+
+But in reality, in most cases, commands need to be processed in real time to quickly inform the dependent systems and effect the appropriate action derived from the command. By combining command processing with event-driven architecture, we can process commands asynchronously in real time and trigger the actions derived from them. Let's see how to achieve this with the help of Apache Kafka and Apache Airflow.
+
+## Command Processing in Apache Kafka
+
+Kafka is designed to handle high volumes of data in real time, making it well-suited for use in complex systems where events are monitored continuously. Some of the capabilities of Kafka which make it ideal for command processing are:
+
+1. **Persistent:** Kafka is designed to be highly persistent, ensuring that messages are not lost even if there are failures or crashes in the system.
+2. **Low latency:** Kafka is optimized for low latency, enabling it to deliver real-time data streams with minimal delay.
+3. **System of record:** Kafka can be a centralised system of record for all the events that have occurred. This also gives us the flexibility to reprocess events in their arrival order in case of failure or requirement.
+4. **Fan out/Parallelism:** Many event-driven apps need fan-out - "write once, read many" - that is, multiple applications may be interested in subscribing to and reacting to a single event.
+
+Kafka topics can act as the medium for passing commands and events between different components of the system. In this architecture, commands are sent to a specific topic by a user or external system, and then consumed by a component responsible for processing that particular command. This architecture also allows for the separation of concerns between different components, making it easier to manage and maintain the system over time.
+
+For example, consider an e-commerce platform where customers can place orders for products. When a customer places an order, a command is sent to a specific Kafka topic, which is consumed by an order processing component. The component then executes the command by creating an order record in a database and generating an event indicating that a new order has been placed.
This event is then sent to another Kafka topic, which is consumed by a shipping component responsible for initiating the shipping process. + +## Orchestrating Event Workflows in Apache Airflow + +Many applications process events in step-wise workflows (also known as Staged Event Driven Architecture) -- these workflows need control on the individual steps (including sequencing, dependencies on previous steps, handling failures, parallel processing etc). Airflow to this end is the best orchestration platform for performing these with a purely Python interface (something that developers love). Airflow allows users to define workflows as Directed Acyclic Graphs (DAGs), where nodes represent individual tasks and edges represent dependencies between those tasks. + +Airflow provides a number of key features to support workflow management, including: + +1. **Scheduling and automation:** Airflow allows users to define schedules and automate workflow execution, ensuring that tasks are executed in the correct order and at the right time. It also supports passing on variables from upstream tasks to downstream tasks making dynamic conditional execution possible. +2. **Monitoring and logging:** Airflow provides rich monitoring and logging capabilities, allowing users to track the progress of workflows, diagnose issues, and optimize performance. +3. **Extensibility:** Airflow supports custom operators, sensors, and hooks, allowing users to interact with a wide range of systems and technologies. +4. **Alerting:** Airflow provides built-in alerting capabilities, allowing you to track the progress of your pipeline and receive notifications if anything goes wrong. +5. **Compatibility:** Airflow supports a wide range of data processing tools and frameworks, including Apache Kafka, Apache Spark, and TensorFlow, making it a highly compatible choice for event processing use cases. + +So, Apache Airflow is not just a great tool for both classic data integration & ETL/ELT oriented processing needs (in the analytics plane) but also orchestration of event driven workflows (in the operational plane). + +## Architecture + +A general architecture is as follows, + +1. Initially, the command stream is produced to a Kafka topic from the user or external systems. +2. A command dispatcher service (like a Kafka consumer) handles reading & committing offsets including hand off to Airflow by invoking the concerned DAG (via it's REST interface). By doing this, we therefore separate consuming from processing. +3. Airflow DAG itself is parametrized to process one command / event at a time. This therefore provides granular observability and auditable controls of what happened, thus enabling retrying/re-processing or dead-letter queue. + +![Dispatcher_Architecture.png](../assets/blog-images/command_processing_blog/Dispatcher_Architecture.png) + +This pattern could thus be used for: + +- Analytical data processing closely tied to events (such as CDC) +- Handle patterns such as CQRS with projections (Airflow DAG computes projections) +- Just general event workflows with chained actions (when this happens, the following chain of other events must occur) + +We must note that this architecture while useful, may not suit high concurrency because of the nature of DAG processing for the following reasons, + +- Each DAG run processes only one event at a time (as opposed to a batch) and thus has a large cost. +- There are also consequent impacts on the dispatcher's ability to maintain strict order in the face of DAG run failures. 
Therefore, processing order cannot be guaranteed in high-concurrency mode.
+
+This architecture is more suited for low-concurrency scenarios requiring granular, execution-level observability of command processing.
+
+Now, let's see an example of how to implement the architecture defined above.
+
+## Prerequisites
+
+- Kafka cluster running remotely or locally. We will be using a Confluent Cloud cluster in this demo. You can easily set up a Confluent Cloud cluster by [signing up](https://confluent.cloud/signup) if you have not already.
+- Python environment with the following packages,
+
+    ```python
+    confluent-kafka[avro]
+    ```
+
+
+## Generate Command Stream
+
+For the purposes of this demo, we will send the events to a topic using a Kafka producer python client. A topic called `event_topic` has already been created in advance.
+
+The client config file to connect to the Kafka cluster will look like the following,
+
+`kafka_server.config`
+
+```
+bootstrap.servers={{ CLUSTER_LISTENER_URL }}:{{ CLUSTER_LISTENER_PORT }}
+security.protocol=SASL_SSL
+sasl.mechanisms=PLAIN
+sasl.username={{ CLUSTER_API_KEY }}
+sasl.password={{ CLUSTER_SECRET_KEY }}
+# Required connection configs for Confluent Cloud Schema Registry
+schema.registry.url={{ SR_URL }}
+basic.auth.credentials.source=USER_INFO
+basic.auth.user.info={{ SR_API_KEY }}:{{ SR_API_SECRET }}
+```
+
+An example message for the `event_topic` is as follows,
+
+```json
+{
+  "command" : "app.send_email_alert",
+  "params": "{\"email\": \"test@email.com\"}"
+}
+```
+
+where,
+
+`command` - The command which needs to be executed. This also helps us determine the dag which needs to be called in case of multiple dags.
+
+`params` - A JSON string which contains all the required arguments for a DAG run. This can vary based on the type of event or command.
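+
+Jumping ahead slightly to the hand-off described in the architecture section: once the dispatcher has consumed such a record, it can trigger the corresponding DAG through Airflow's stable REST API. A minimal sketch of that hand-off follows; the Airflow URL, the basic-auth credentials and the command-to-DAG naming convention are assumptions for illustration.
+
+```python
+import json
+import requests
+
+AIRFLOW_URL = "http://localhost:8080"   # assumed Airflow webserver
+AIRFLOW_AUTH = ("airflow", "airflow")   # assumed basic-auth credentials
+
+def dispatch(command_record):
+    """Map a consumed command record to a DAG run via Airflow's stable REST API."""
+    # e.g. "app.send_email_alert" -> DAG id "send_email_alert" (assumed convention)
+    dag_id = command_record["command"].split(".")[-1]
+    conf = json.loads(command_record["params"])  # becomes dag_run.conf inside the DAG
+    response = requests.post(
+        f"{AIRFLOW_URL}/api/v1/dags/{dag_id}/dagRuns",
+        auth=AIRFLOW_AUTH,
+        json={"conf": conf},
+    )
+    response.raise_for_status()
+    return response.json()
+
+# dispatch({"command": "app.send_email_alert", "params": "{\"email\": \"test@email.com\"}"})
+```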
+ +Following is the sample producer code to produce events to the `event_topic` , + +```python +from confluent_kafka import Producer +from confluent_kafka.serialization import StringSerializer +from confluent_kafka.schema_registry import SchemaRegistryClient +from confluent_kafka.schema_registry.avro import AvroSerializer +from confluent_kafka.serialization import SerializationContext, MessageField +import json +import argparse + +def read_ccloud_config(config_file): + conf = {} + with open(config_file) as fh: + for line in fh: + line = line.strip() + if len(line) != 0 and line[0] != "#": + parameter, value = line.strip().split('=', 1) + conf[parameter] = value.strip() + return conf + +def pop_schema_registry_params_from_config(conf): + """Remove potential Schema Registry related configurations from dictionary""" + + conf.pop('schema.registry.url', None) + conf.pop('basic.auth.user.info', None) + conf.pop('basic.auth.credentials.source', None) + + return conf + +if __name__ == '__main__': + + # Read arguments and configurations and initialize + parser = argparse.ArgumentParser(description='Kafka Producer') + parser.add_argument('--config_file', required=True,help='Absolute path to configurations file that contains properties used to configure a Kafka producer') + parser.add_argument('--topic', required=False, help='Kafka topic to produce messages to.') + config_file = args.config_file + topic = args.topic + conf = read_ccloud_config(config_file) + + schema_registry_conf = { + 'url': conf['schema.registry.url'], + 'basic.auth.user.info': conf['basic.auth.user.info']} + + schema_registry_client = SchemaRegistryClient(schema_registry_conf) + + event_schema = { + "type" : "record", + "name" : "EventSchema", + "namespace" : "event", + "fields" : [ { + "name" : "command", + "type" : "string", + "doc" : "Command to process and the DAG to trigger" + }, + { + "name" : "params", + "type" : "string", + "doc" : "Arguments for event workflow specific DAG" + } ], + "doc:" : "Schema of example command" + } + + schema_str = json.dumps(event_schema) + + event_avro_serializer = AvroSerializer(schema_registry_client = schema_registry_client, + schema_str = schema_str) + + string_serializer = StringSerializer('utf_8') + + producer_conf = pop_schema_registry_params_from_config(conf) + producer = Producer(producer_conf) + + delivered_records = 0 + + # Optional per-message on_delivery handler (triggered by poll() or flush()) + # when a message has been successfully delivered or + # permanently failed delivery (after retries). + def acked(err, msg): + global delivered_records + """Delivery report handler called on + successful or failed delivery of message + """ + if err is not None: + print("Failed to deliver message: {}".format(err)) + else: + delivered_records += 1 + print("Produced record to topic {} partition [{}] @ offset {}" + .format(msg.topic(), msg.partition(), msg.offset())) + + event_json = { + "event" : "app.send_email_alert", + "params": "{\"email\": \"test@email.com\"}" + } + producer.produce(topic=topic, + key=string_serializer("event"), + value=avro_serializer(event_json, SerializationContext(topic, MessageField.VALUE)), + on_delivery=acked) + + print("\nFlushing records...") + producer.flush() +``` + +Run the above python code with topic and config file as arguments, + +```bash +python producer.py --topic event_topic --config_file kafka_server.config +``` + +## Download and setup Airflow + +For this tutorial purposes, we will run airflow in docker. 
For Production, Airflow supports helm charts for Kubernetes deployment. We will be following this [link](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html) to set up Airflow in docker. + +1. Open a terminal window and navigate to the working directory. + + ```bash + cd airflow_kafka + ``` + +2. Fetch the `docker-compose.yml` file, + + ```bash + curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.3.2/docker-compose.yaml' + ``` + +3. Create the required folders for docker compose and set the airflow uid (for Linux), + + ```bash + mkdir -p ./dags ./logs ./plugins + echo -e "AIRFLOW_UID=$(id -u)" > .env + ``` + +4. Run database migrations and create the first user account, + + ```bash + docker-compose up airflow-init + ``` + +5. Now the Airflow is set up and can be started by executing, + + ```bash + docker-compose up -d + ``` + + +Airflow web UI can be accessed at [http://localhost:8080/](http://localhost:8080/) + +## Define Event Workflow DAG + +Event-workflow dags should be defined in the Airflow `dags` folder for the airflow to recognize it. A dag can be a sequence of tasks to be executed to process an incoming event. In this example, a DAG to send email alerts on trigger will be used. This concept can be extended to any type of DAGs with different operators. + +`/opt/airflow/dags/send_email.py` + +```python +from datetime import datetime, timedelta +from airflow import DAG +from airflow.operators.email_operator import EmailOperator + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': datetime(2023, 3, 1), + 'email_on_failure': True, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + +dag = DAG( + 'send_email_alert', + default_args=default_args, + schedule_interval='@once', +) + +send_email_task = EmailOperator( + task_id='send_alert_task', + to='{{ dag_run.conf["email"] if dag_run else "admin@email.com" }}', + subject='Alert Email from Airflow', + html_content='

    This is an alert email sent from Airflow.

    ',
    dag=dag,
)

send_email_task

```

In this example, we've defined a DAG called `send_email_alert` that runs once (**`schedule_interval='@once'`**) and has a single task called **`send_email_task`**. The **`EmailOperator`** is used to send the email, and we've defined the recipient, subject, and body of the email as arguments to the operator. The recipient is passed in through the DAG run `conf` parameter. We've also specified some default arguments for the DAG, such as the number of times to retry the task if it fails.

![EmailDAG.png](../assets/blog-images/command_processing_blog/EmailDAG.png)

To run this DAG, you'll need a working SMTP server set up to send the email. You can configure the SMTP server settings in your Airflow configuration file.

`/opt/airflow/airflow.cfg`

```
[smtp]
# If you want airflow to send emails on retries, failure, and you want to use
# the airflow.utils.email.send_email_smtp function, you have to configure an
# smtp server here
smtp_host = smtp.gmail.com
smtp_starttls = True
smtp_ssl = False
smtp_user = your gmail id
# smtp_password = your gmail password
smtp_port = 25
smtp_mail_from = the email address you want to send mail from (your mail id)
```

## Process Command Stream and trigger DAG

Event streams need to be consumed by a Dispatcher (Kafka Consumer) which knows how to process an incoming event or action. In this case, the dispatcher decides which event workflow DAG needs to be triggered based on the incoming events. The dispatcher should be able to talk to Airflow and trigger the appropriate DAG. We will use the Airflow HTTP API to trigger the DAG run.

Following is an example Python dispatcher code,

```python
from confluent_kafka import Consumer
from confluent_kafka.serialization import StringDeserializer
from confluent_kafka.serialization import SerializationContext, MessageField
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroDeserializer
import json
import argparse
import requests

def read_ccloud_config(config_file):
    """Parse the Kafka client config file into a dictionary."""
    conf = {}
    with open(config_file) as fh:
        for line in fh:
            line = line.strip()
            if len(line) != 0 and line[0] != "#":
                parameter, value = line.strip().split('=', 1)
                conf[parameter] = value.strip()
    return conf

def pop_schema_registry_params_from_config(conf):
    """Remove potential Schema Registry related configurations from dictionary"""

    conf.pop('schema.registry.url', None)
    conf.pop('basic.auth.user.info', None)
    conf.pop('basic.auth.credentials.source', None)

    return conf

if __name__ == '__main__':

    # Read arguments and configurations and initialize
    parser = argparse.ArgumentParser(description='Kafka Consumer')
    parser.add_argument('--config_file', required=True, help='Absolute path to configurations file that contains properties used to configure a Kafka consumer')
    parser.add_argument('--topic', required=True, help='Kafka topic to consume messages from.')
    args = parser.parse_args()
    config_file = args.config_file
    topic = args.topic
    conf = read_ccloud_config(config_file)

    schema_registry_conf = {
        'url': conf['schema.registry.url'],
        'basic.auth.user.info': conf['basic.auth.user.info']}

    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    event_schema = {
        "type": "record",
        "name": "EventSchema",
        "namespace": "event",
        "fields": [{
            "name": "command",
            "type": "string",
            "doc": "Command to process and the DAG to trigger"
        },
        {
            "name": "params",
            "type": "string",
            "doc": "Arguments for event workflow specific DAG"
        }],
        "doc": "Schema of example command"
    }

    schema_str = json.dumps(event_schema)

    event_avro_deserializer = AvroDeserializer(schema_registry_client=schema_registry_client,
                                               schema_str=schema_str)

    string_deserializer = StringDeserializer('utf_8')

    consumer_conf = pop_schema_registry_params_from_config(conf)
    # A consumer group id is mandatory for the Kafka consumer
    consumer_conf['group.id'] = 'dispatcher_group'
    consumer_conf['auto.offset.reset'] = 'earliest'
    consumer = Consumer(consumer_conf)
    consumer.subscribe([topic])

    try:
        while True:
            # SIGINT can't be handled when polling, limit timeout to 1 second.
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            key = string_deserializer(msg.key())
            event_json = event_avro_deserializer(msg.value(), SerializationContext(msg.topic(), MessageField.VALUE))
            if event_json is not None:
                command = event_json["command"]
                params = json.loads(event_json["params"])
                # DAG ID is inferred from the command value
                dag_id = command.split(".")[1]
                payload = {"conf": params}
                # The Authorization is the base64 encoded value of the Airflow username and password
                headers = {
                    "Content-Type": "application/json",
                    "Authorization": "Basic YWRtaW46YWRtaW4="
                }
                dag_run = requests.post(f'http://localhost:8080/api/v1/dags/{dag_id}/dagRuns', data=json.dumps(payload), headers=headers)
                dag_response = dag_run.json()
                print(dag_response)
    except KeyboardInterrupt:
        pass
    finally:
        # Leave group and commit final offsets
        consumer.close()

```

The `dag_id` should be inferred dynamically from the event message, based on an agreed schema, so that a single dispatcher can serve a wide range of DAGs without manual intervention. In this example, the `dag_id` is inferred from the `command` field of the event message. For example, if `command` is `app.send_email_alert`, the DAG id is assumed to be `send_email_alert`.

Run the above Python code with the topic and config file as arguments,

```bash
python dispatcher.py --topic event_topic --config_file kafka_server.config
```

The triggered DAG and its status can be viewed in the UI.

## Conclusion

We can build robust, scalable, and flexible event workflows in Apache Airflow and trigger them by processing commands from Apache Kafka in real time. Kafka allows you to stream events from various sources and distribute them to multiple consumers, while Airflow provides a flexible framework for orchestrating the processing of these events using event-specific DAGs. With Airflow's built-in monitoring and alerting capabilities, you can easily track the progress of your pipelines and receive notifications if anything goes wrong. Overall, using Kafka and Airflow together can help streamline your event processing workflows and improve your data processing capabilities.

diff --git a/_posts/2023-03-30-kafka-client-performance-metrics.md b/_posts/2023-03-30-kafka-client-performance-metrics.md
new file mode 100644
index 0000000000..dce1d69dd8
--- /dev/null
+++ b/_posts/2023-03-30-kafka-client-performance-metrics.md
@@ -0,0 +1,523 @@
---
layout: post
title: "The Kafka benchmarking suite"
categories: [Kafka, Kubernetes, Performance, Helm]
featured: false
image: assets/blog-images/kafka-perf-suite/kafka-benchmark-metrics-featured.png
teaser: "Distributed testing grid for Kafka on top of Kubernetes"
authors: badri,p6
toc: true
---

There are numerous parameters we have to consider tweaking when benchmarking a Kafka cluster. Irrespective of these parameters, we optimize along the following dimensions:

1. Throughput: The volume of data moved across the cluster in a given unit of time.
2.
Durability: Are we minimizing the chance of messages getting lost +3. Availability: How quickly can my Kafka cluster recover from a failure scenario and be operational again? +4. Latency: How soon can a message be consumed after it is produced? + +## Defining our problem statement + +The purpose of a Kafka benchmarking exercise is to initially establish baseline performance characteristics, mainly for throughput and end to end latency. We then iteratively optimize these quantities by modifying various producer and consumer parameters, running the benchmark again, measuring the quantities. These steps are done in a “rinse and repeat” fashion till we hit our desired performance metrics. + +There are quite a few performance testing frameworks in the open which aid in doing this. We prepare a wishlist of items required for a good benchmarking system, like: + +1. It should be close to a realistic production scenario. No toy benchmarking. +2. Ability to snapshot and visualize the data from various test runs. +3. Ease of setup. The benchmarking system itself shouldn’t be complicated to setup with a lot of configuration landmines. Nothing is more dangerous than a misconfigured benchmarking system which gives us false promises. + +Let’s see how existing setups measure up against this wishlist. + +## Openmessaging benchmarks + +The [Openmessaging benchmark framework](https://openmessaging.cloud/docs/benchmarks/) is a vendor-neutral suite of tools which eases benchmarking of distributed benchmarking systems. In addition to Kafka, this framework provides benchmarking tools for other messaging systems like RabbitMQ and Apache Pulsar. +In essence, Openmessaging benchmarking involves the following steps broadly: + +1. Spin up the infrastructure needed to run Kafka in the cloud using Terraform +2. Provision and install Kafka clusters and clients using Ansible playbook +3. Run benchmarks from client hosts. NOTE that this can be run in a distributed manner as well across multiple hosts. + +One of the biggest selling points of Openmessaging benchmarks is that they are as close to realistic scenarios as possible. In fact, this is one of the stated goals of the project. There is also the provision to download the metrics data for each test run/scenario. + +We observed a few shortcomings with Openmessaging benchmarks. + +1. The inability to measure end to end latency. This is a key metric in many scenarios and there is no documented way to measure this in Openmessaging suite. +2. Ease of setup. Openmessaging provides all the necessary scaffold scripts like Terraform and Ansible to run the test. This could be made easier. There is an option to deploy it as a [Helm chart](https://github.com/openmessaging/benchmark/blob/master/deployment/kubernetes/helm/README.md), which closely resembles our approach as well. But this isn’t documented. +3. As a sub-idea of 2, we don’t have an easy way to visualize these numbers. We all love to see the big picture instead of staring at a spreadsheet full of numbers. It would be great if we can pipe the output straight to tools like Grafana. + +## Apache Trogdor + +[Apache Trogdor](https://github.com/apache/kafka/tree/trunk/trogdor) is a test framework for Kafka. Although it does excellent performance testing, Trogdor emphasizes a lot of [fault testing](https://cwiki.apache.org/confluence/display/KAFKA/Fault+Injection). The idea is to run a Trogdor agent in every cluster node and a Trogdor coordinator daemon which manages the agents and tasks in the cluster. 
The agent can simulate a fault by crashing one of the brokers. We measure the message durability, adjust the parameters and re-run the process again. Confluent cloud [uses Trogdor](https://www.confluent.io/blog/cloud-kafka-as-a-service/) to performance test its infrastructure. + +Again, we found the following things lacking in Trogdor. + +1. No way to measure end to end latency. The [RoundTripWorkload](https://github.com/apache/kafka/tree/trunk/trogdor#roundtripworkload) is a close match for what we were looking for, but doesn’t quite cut it. +2. It doesn’t emit metrics which can be visualized. There is a provision to show the status of a submitted task. This gives a snapshot of the task(a task is a specific, defined benchmark scenario), like time taken, number of messages sent, 99th percentile latency etc. +3. We have to run the agent and coordinator daemon processes in the cluster. Sometimes, due to various limitations, we might not be able to do so. + +## Jmeter + +Jmeter is an excellent tool to benchmark Kafka clusters and scores pretty high on the “ease of setup” factor. [PepperBox](https://pepperbox.gslab.com/), a specialized Kafka load generator for Jmeter, is a prerequisite for running load tests using Jmeter. However, Jmeter suffers from the following limitations: + +1. We have to write the consumer client code for running the end to end tests. +2. There is no way to measure the latency. We had to tweak the consumer to write the timestamp to a file. Ideally, there should be a complementing tool for PepperBox at the consumer end for this. + +## Kafka perf tools + +Kafka offers a set of performance testing tools for the producer, consumer and also to measure end to end latency. This is the least common denominator of all the tools out there which fits our bill, but we’re not quite out there yet. We don’t have the option to load test a production replica scenario(Ex. Parallel producers/consumers). There’s also the “ease of setup” factor missing. We have to clean up the resources and re-run the process for a new scenario. Finally, there is no way to record and view the metrics. We will build a Prometheus based scaffold to send these metrics. In the rest of the post, we shall focus on how we’ve fixed these issues and created a robust and reliable performance testing framework which fulfills our wishlist. + +## Prerequisites + +You should have a Kafka cluster which needs to be benchmarked(Duh!). Arguably the fastest way to get your hands on a fairly production ready Kafka cluster is to sign up for a Confluent cloud account and spin up a new cluster. + +## Benchmarking using kafka provided tools + +For those of you who are new to benchmarking using Kafka perf-tools, here’s a brief recap. +First, you create a new topic where you want to send your records. + +```bash +$ kafka-topics \ # <-------- (1) + --if-not-exists \ + --topic mytopic \ # <-------- (2) + --create \ + --bootstrap-server xxx-yyyy.us-west4.gcp.confluent.cloud:9092 \ # <-------- (3) + --replication-factor 3 \ # <-------- (4) + --partitions 1 \ # <-------- (5) + --command-config kafka.properties # <-------- (6) +``` + +1. `kafka-topics` is the CLI tool which ships with [Apache Kafka](https://kafka.apache.org/downloads)(as `kafka-topics.sh`) and [confluent platform package](https://docs.confluent.io/platform/current/installation/installing_cp/zip-tar.html#prod-kafka-cli-install). +2. Name of the topic you want to create +3. The bootstrap server URL. You get this from your confluent cloud account. +4. 
Replication factor for your topic. Please update it with your desired replication factor. +5. The number of partitions for the topic. Again, update it to suit your needs +6. The `kafka.properties` file, needed for all the CLI tools we will run. + +Let’s quickly take a look at the contents of kafka.properties. + +``` +bootstrap.servers=xxx-yyyy.us-west4.gcp.confluent.cloud:9092 # <-- (1) +security.protocol=SASL_SSL # <-- (2) +sasl.jaas.config=org.apache.kafka.common.security.plain.PlainLoginModule required username='BADRI123SZK5QZJI' password='Supersecretlongpasswordgivenbyconfluentcloud'; # <-- (3) +sasl.mechanism=PLAIN # <-- (4) +``` + +1. The bootstrap server URL, which you can obtain from confluent cloud. +2. Confluent cloud uses SASL_SSL for authentication. +3. The user name and password can be obtained from confluent cloud. + +**NOTE** that this properties file will be used across the board for the producer and consumer benchmark tools as well. + +Time to pump in some messages to this topic. + +```bash +$ kafka-producer-perf-test \ # <-- (1) + --topic mytopic \ # <-- (2) + --num-records 10000 \ # <-- (3) + --record-size 1024 \ # <-- (4) + --throughput -1 \ # <-- (5) + --producer-props acks=all client.id=producer-01 \ # <-- (6) + --producer.config kafka.properties + --print-metrics # < -- (7) +``` + +1. We run the `kafka-producer-perf-test` CLI tool for this. +2. We send messages to the topic we created just now. +3. We’re sending 10k records. +4. The size of each of these records is 1024 bytes. +5. We don’t want to throttle the number of messages per second. Hence, we set it to -1. +6. Number of acknowledgements the leader broker must have received before acknowledging to the producer. We also specify the client ID here. +7. Print the producer metrics. + +We should get an output which looks similar to this. + +``` +1 records sent, 0.2 records/sec (0.00 MB/sec), 5289.0 ms avg latency, 5289.0 ms max latency. +1215 records sent, 240.6 records/sec (0.24 MB/sec), 4727.5 ms avg latency, 7196.0 ms max latency. +1275 records sent, 246.3 records/sec (0.24 MB/sec), 9746.1 ms avg latency, 12365.0 ms max latency. +1500 records sent, 269.5 records/sec (0.26 MB/sec), 14727.0 ms avg latency, 17925.0 ms max latency. +1425 records sent, 283.6 records/sec (0.28 MB/sec), 20428.6 ms avg latency, 22947.0 ms max latency. +1650 records sent, 320.0 records/sec (0.31 MB/sec), 25312.8 ms avg latency, 28097.0 ms max latency. +1200 records sent, 238.0 records/sec (0.23 MB/sec), 30665.2 ms avg latency, 33137.0 ms max latency. +1200 records sent, 212.7 records/sec (0.21 MB/sec), 35153.8 ms avg latency, 38780.0 ms max latency. +10000 records sent, 223.613596 records/sec (0.22 MB/sec), 21165.35 ms avg latency, 41544.00 ms max latency, 21647 ms 50th, 38785 ms 95th, 40823 ms 99th, 41544 ms 99.9th. + +Metric Name Value +app-info:commit-id:{client-id=producer-1} : 06652ad189d0cbf0 +app-info:start-time-ms:{client-id=producer-1} : 1653549654259 +app-info:version:{client-id=producer-1} : 6.2.1-ce +kafka-metrics-count:count:{client-id=producer-1} : 113.000 +producer-metrics:batch-size-avg:{client-id=producer-1} : 15548.256 +producer-metrics:batch-size-max:{client-id=producer-1} : 15556.000 +producer-metrics:batch-split-rate:{client-id=producer-1} : 0.000 +producer-metrics:batch-split-total:{client-id=producer-1} : 0.000 +producer-metrics:buffer-available-bytes:{client-id=producer-1} : 33554432.000 + +... 
+producer-topic-metrics:record-error-total:{client-id=producer-1, topic=mytopic} : 0.000 +producer-topic-metrics:record-retry-rate:{client-id=producer-1, topic=mytopic} : 0.000 +producer-topic-metrics:record-retry-total:{client-id=producer-1, topic=mytopic} : 0.000 +producer-topic-metrics:record-send-rate:{client-id=producer-1, topic=mytopic} : 250.752 +producer-topic-metrics:record-send-total:{client-id=producer-1, topic=mytopic} : 10000.000 +``` + +Let’s consume the messages we created. + +```bash +$ kafka-consumer-perf-test \ + --topic mytopic \ # <-- (1) + --bootstrap-server xxx-yyyy.us-west4.gcp.confluent.cloud:9092 \ # <-- (2) + --messages 1000 \ # <-- (3) + --consumer.config kafka.properties # <-- (4) + --print-metrics \ + --timeout=100000 # <-- (5) +``` + +1. We use the same topic. +2. We have to specify the bootstrap server. +3. The number of messages we want to consume. +4. We refer to the same kafka.properties file. +5. The amount of time the consumer process waits before the broker returns records. + +Here’s a sample output from a consumer perf test run. + +``` +start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec +2023-03-30 13:18:56:066, 2023-03-30 13:19:11:826, 0.9766, 0.0620, 1000, 63.4518, 6219, 9541, 0.1024, 104.8108 + +Metric Name Value +consumer-coordinator-metrics:assigned-partitions:{client-id=consumer-perf-consumer-24667-1} : 0.000 +consumer-coordinator-metrics:commit-latency-avg:{client-id=consumer-perf-consumer-24667-1} : 218.500 +consumer-coordinator-metrics:commit-latency-max:{client-id=consumer-perf-consumer-24667-1} : 222.000 +consumer-coordinator-metrics:commit-rate:{client-id=consumer-perf-consumer-24667-1} : 0.057 +consumer-coordinator-metrics:commit-total:{client-id=consumer-perf-consumer-24667-1} : 2.000 +consumer-coordinator-metrics:failed-rebalance-rate-per-hour:{client-id=consumer-perf-consumer-24667-1} : 88.703 +consumer-coordinator-metrics:failed-rebalance-total:{client-id=consumer-perf-consumer-24667-1} : 1.000 +consumer-coordinator-metrics:heartbeat-rate:{client-id=consumer-perf-consumer-24667-1} : 0.081 + +... +consumer-fetch-manager-metrics:records-lag-max:{client-id=consumer-perf-consumer-24667-1} : 41500.000 +consumer-fetch-manager-metrics:records-lag:{client-id=consumer-perf-consumer-24667-1, topic=mytopic, partition=0} : 41000.000 +consumer-fetch-manager-metrics:records-lead-avg:{client-id=consumer-perf-consumer-24667-1, topic=mytopic, partition=0} : 750.000 +consumer-fetch-manager-metrics:records-lead-min:{client-id=consumer-perf-consumer-24667-1, topic=mytopic, partition=0} : 500.000 +consumer-fetch-manager-metrics:records-lead-min:{client-id=consumer-perf-consumer-24667-1} : 500.000 +consumer-fetch-manager-metrics:records-lead:{client-id=consumer-perf-consumer-24667-1, topic=mytopic, partition=0} : 1000.000 +consumer-fetch-manager-metrics:records-per-request-avg:{client-id=consumer-perf-consumer-24667-1, topic=mytopic} : 1000.000 +consumer-fetch-manager-metrics:records-per-request-avg:{client-id=consumer-perf-consumer-24667-1} : 1000.000 +kafka-metrics-count:count:{client-id=consumer-perf-consumer-24667-1} : 61.000 +``` + +The typical way to run Kafka benchmarks is to take a set of parameters for the producer and consumer, do a set of sample runs with those parameters, and record the metrics we get. We repeat this loop until we get the desired numbers. 
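
To make that loop concrete, here is a rough, illustrative harness (not part of the post's tooling) that sweeps a few producer parameter sets, shells out to `kafka-producer-perf-test`, and appends the final summary line of each run to a CSV. It assumes the Kafka CLI tools and the `kafka.properties` file shown above are available on the machine running the benchmark; the parameter values are placeholders.

```python
import csv
import subprocess

# Parameter grid to sweep; the values here are purely illustrative.
param_sets = [
    {"acks": "1", "batch.size": "16384", "linger.ms": "0", "compression.type": "none"},
    {"acks": "all", "batch.size": "100000", "linger.ms": "100", "compression.type": "lz4"},
]

def run_producer_benchmark(props):
    """Shell out to kafka-producer-perf-test and return its final summary line."""
    cmd = [
        "kafka-producer-perf-test",
        "--topic", "mytopic",
        "--num-records", "10000",
        "--record-size", "1024",
        "--throughput", "-1",
        "--producer-props", *[f"{k}={v}" for k, v in props.items()],
        "--producer.config", "kafka.properties",
    ]
    out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
    # Without --print-metrics, the last line is the aggregate summary
    # ("10000 records sent, ... 99.9th.").
    return out.strip().splitlines()[-1]

with open("runs.csv", "a", newline="") as fh:
    writer = csv.writer(fh)
    for props in param_sets:
        writer.writerow([str(props), run_producer_benchmark(props)])
```

Each row in `runs.csv` then corresponds to one pass through the tuning loop.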
This can be likened to an OODA (Observe Orient Decide Act) loop, where the mapping looks like this: + +- Observe - Look at the printed metrics for each run. +- Orient - Find out what configuration tweak led to this set of outcomes. +- Decide - Figure what configuration parameters to change for next run. +- Act - Run the benchmarks CLI program. + +Applying the OODA loop to the plain vanilla kafka benchmark CLI tools can get messy quickly. For instance, what should I be doing to run 20 parallel producers? Also, how can I move all these metrics to a central location for later analysis/charting? +What numbers did we get the last time we ran for a particular set of config values? + +We fix these open questions in the rest of the post. + +## Take 2 - Kubernetes + +One major change in our benchmarking procedure will be the fact that we will move all our benchmark CLI tools to Kubernetes services. So, the prerequisite here would be for you to have a working Kubernetes cluster with the kubectl client. + +Let’s try running this as a Kubernetes Job. + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: producer # <-------- (1) + labels: + type: producer +spec: + completions: 3 # <-------- (2) + parallelism: 3 + template: + spec: + initContainers: # <-------- (3) + - name: topics + image: confluentinc/cp-kafka:7.3.2 # <-------- (4) + command: + - /bin/sh + - -c + - | + kafka-topics \ + --if-not-exists \ + --topic mytopic \ + --create \ + --bootstrap-server xxx-yyyy.us-west4.gcp.confluent.cloud:9092 \ + --replication-factor 3 \ + --partitions 1 \ + --command-config /mnt/kafka.properties + volumeMounts: + - name: kafka-properties # <-------- (5) + mountPath: /mnt + containers: + - name: producer + image: confluentinc/cp-kafka:7.3.2 + command: + - /bin/sh + - -c + - | + kafka-producer-perf-test \ # <-------- (6) + --topic perf-test \ + --num-records 10000 \ + --record-size 1024 \ + --throughput -1 \ + --producer-props acks=1 client.id=foo batch.size=1000 linger.ms=100 compression.type=lz4 \ + --producer.config /mnt/kafka.properties + volumeMounts: + - name: kafka-properties + mountPath: /mnt + volumes: + - name: kafka-properties + secret: + secretName: "kafka-client-config" + restartPolicy: Never + backoffLimit: 4 +``` + +1. We run a job called "producer". +2. Indicates the number of parallel producer jobs we want to run. +3. We run an initcontainer which creates the topic if it doesn’t exist. +4. The latest confluent kafka container image at the time of writing this. Please substitute with a more recent image if applicable. +5. We model the `kafka.properties` as a kubernetes secret and mount it here as a volume. +6. We run the actual producer perf test process with all the parameters. + +Now, we can run this in a Kubernetes cluster and get the same OODA loop running. But what changed, other than moving everything inside containers? For starters, we can parallelize the process simply by increasing the `parallelism` spec of the job. + +Also, we can ship these metrics to Prometheus if we bake the kafka container image with a JMX agent. + +```Dockerfile +FROM confluentinc/cp-kafka:7.3.2 # <---------- (1) + +COPY jmx /usr/app # <---------- (2) +# Install Prometheus +RUN curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest \ + | grep browser_download_url \ + | grep linux-amd64 \ + | cut -d '"' -f 4 \ + | wget -qi - && \ + tar xf prometheus*.tar.gz && \ + cd prometheus*/ && \ + mv prometheus .. # <---------- (3) +``` + +1. We start with the official Confluent Kafka base image. 
+2. We copy over the JMX agent jar and corresponding YAML configuration file tailored for Kafka. +3. We install Prometheus binary. + +Prometheus is typically configured for pull based metrics. In our case, we run the prometheus binary in a push-based fashion. We have a JMX server running in port `7071` for the duration of the Job. We configure Prometheus to run in agent mode, where it will fetch metrics from the JMX server and push to a pre-configured write endpoint in the Prometheus server. + +```bash +./prometheus --enable-feature=agent --config.file="/prom/prometheus.yml" --log.level=error +``` + +Let's take a quick look at the prometheus agent configuration. + +```yaml +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: "jmx" # <---- (1) + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ["localhost:7071"] # <---- (2) + labels: + env: "dev" # <---- (3) + relabel_configs: + - source_labels: [__address__] + target_label: hostname + regex: "([^:]+)(:[0-9]+)?" + replacement: "${1}" +remote_write: + - url: "http://your-prometheus-url/api/v1/write" # <---- (4) +``` + +1. Name of the prometheus Job. +2. The target from which we scrape the metrics. +3. A unique identifier for every run. +4. The Prometheus server url to write the metrics to. + +**NOTE** that the Prometheus server should have "remote write receiver" [enabled](https://prometheus.io/docs/prometheus/latest/feature_flags/#remote-write-receiver) for the prometheus agent to send JMX metrics. + +I’d argue that we didn’t gain much from this transition, except for the Prometheus integration. I still have to change the config parameters for the next run, kubectl apply it, and track the metrics. We can do better. We could run a Helm chart with a parameterized set of input values and ship off these metrics to Prometheus. Before we go into the details of the Helm chart, let's define how a typical testing loop looks like. + +1. Come up with a baseline test on standard hardware / test conditions +2. Identify specific scenarios (these translate into producer and consumer configs) +3. Configure accordingly and run the test +4. Look at the client "blackbox" profile for throughput, latency +5. Look at the client "whitebox" profile - extended JMX metrics +6. Look at the server-side "blackbox" profile - throughput, latency +7. Look at the server-side "whitebox" profile - extended JMX metrics +8. Look at the log levels both client and server side - if needed, increase it to look at specific problems, if any + +Repeat-rinse steps 3 - 8 and infer performance based on deviation from the baseline. + +## Take 3 - Helm chart + +We take the following artifacts, + +1. The kafka.config secret +2. Producer perf test job +3. Consumer perf test job +4. Prometheus config + +And templatize them, package them as a Helm chart. We model every iteration in our OODA loop as a new helm release with its own set of helm values. + +The Helm chart does the following: + +1. Run a distributed set of producer jobs(defined by the `producer.count` parameter) +2. 
run a distributed consumer - simplest is to have an array of consumers (and topics) defined so we don't get into managing concurrency; instead, we just create `consumer.count` number of consumer Jobs to paralelly consume from the topic. + +**Note** that we will need the consumer to run with a countdown latch or timeout. + +Here’s a performance test release with values optimized for throughput. + +```yaml +topic: + name: "t1_benchmark" + replicationFactor: 3 + partitions: 1 + +broker: + url: "xxx-yyyy.us-west4.gcp.confluent.cloud:9092" + username: "XXXXX" + password: "YXYXYXYXY2341234wesfdf345234" + +producer: + enabled: true + image: "confluentinc/cp-kafka:7.3.2" + recordsCount: 1000 + recordSizeBytes: 1024 + throughput: "-1" + acks: 1 + count: 3 + params: "batch.size=100000 linger.ms=100 compression.type=lz4" + +consumer: + enabled: true + image: "confluentinc/cp-kafka:7.3.2" + messagesCount: 1000 + count: 1 + timeout: 100000 +``` + +And here’s another set of values optimized for high durability. + +```yaml +topic: + name: "t1_benchmark" + replicationFactor: 3 + partitions: 1 + +broker: + url: "xxx-yyyy.us-west4.gcp.confluent.cloud:9092" + username: "XXXXX" + password: "YXYXYXYXY2341234wesfdf345234" + +producer: + enabled: true + image: "confluentinc/cp-kafka:7.3.2" + recordsCount: 1000 + recordSizeBytes: 1024 + throughput: "-1" + acks: "all" + count: 3 + params: "enable.idempotence=true max.in.flight.requests.per.connection=1" + +consumer: + enabled: true + image: "confluentinc/cp-kafka:7.3.2" + messagesCount: 1000 + count: 1 + timeout: 100000 +``` + +### A quick aside on Prometheus and Grafana + +The code accompanying this post ships with a custom Helm chart which piggy backs on the [Kube Prometheus stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install Prometheus, Grafana and a couple of associated producer and consumer dashboards. This chart will also wire Prometheus as a datasource for the Grafana installation. + +You can install this setup using the following command. + +```bash +helm upgrade --install prom-stack ./kube-prometheus-kafka-chart +``` + +The Prometheus console can be viewed in the browser by issuing a port-forward command. + +```bash + kubectl port-forward svc/prometheus-operated 9090:9090 +``` + +We can do something similar for the Grafana web console as well. + +```bash +kubectl port-forward svc/prom-stack-grafana 8080:80 +``` + +Grafana requires credentials, which are `admin` and `platformatory`(the default password in the Helm chart). + +### Running the performance tests + +Once we have the scaffold ready, we run a helm release using the following command, + +```bash +helm upgrade --install march-25-2023-durability-producer ./kafka-performance-metrics-chart --values durability-producer-values.yml +``` + +This is how a typical producer dashboard would look like. + +![Producer Dashboard 1](../assets/blog-images/kafka-perf-suite/producer-dashboard-01.png) + +![Producer Dashboard 2](../assets/blog-images/kafka-perf-suite/producer-dashboard-02.png) + +![Producer Dashboard 3](../assets/blog-images/kafka-perf-suite/producer-dashboard-03.png) + +Next, we run the corresponding consumer job using the Helm chart. 
+ +![Consumer Dashboard 1](../assets/blog-images/kafka-perf-suite/consumer-dashboard-01.png) + +![Consumer Dashboard 2](../assets/blog-images/kafka-perf-suite/consumer-dashboard-02.png) + +![Consumer Dashboard 3](../assets/blog-images/kafka-perf-suite/consumer-dashboard-03.png) + +```bash +helm upgrade --install march-25-2023-durability-consumer ./kafka-performance-metrics-chart --values durability-consumer-values.yml +``` + +...and process the consumer dashboard. + +### Shipping metrics to other systems + +The performance metrics Helm chart has provision to write openmetrics to any system which supports the Prometheus remote write feature. This configuration can be changed in the `values.yaml` as illustrated below: + +```yaml +prometheus: + remote_write: + - url: "http://prom-stack-kube-prometheus-prometheus:9090/api/v1/write" +``` + +As a quick example, if you want to use New Relic to process the metrics, your configuration would look like this: + +```yaml +prometheus: + remote_write: + - url: https://metric-api.newrelic.com/prometheus/v1/write?prometheus_server=kafka-perf-test + bearer_token: xxxxxxxxxxxxxxxxxxxxxxxxxxx +``` + +The Helm charts and the associated code mentioned in this post can be found [here](https://github.com/Platformatory/kafka-performance-suite). diff --git a/_posts/2023-06-28-End-To-End-Streaming-Pipeline-using-KSQLDB-and-Kafka-Connect.md b/_posts/2023-06-28-End-To-End-Streaming-Pipeline-using-KSQLDB-and-Kafka-Connect.md new file mode 100644 index 0000000000..866c63aa7d --- /dev/null +++ b/_posts/2023-06-28-End-To-End-Streaming-Pipeline-using-KSQLDB-and-Kafka-Connect.md @@ -0,0 +1,424 @@ +--- +layout: post +title: "End-To-End Streaming Pipeline using KSQLDB and Kafka Connect" +author: ashwin +categories: [ KSQL, Kafka Connect, Confluent Cloud, Kafka, Stream Design, Docker] +image: assets/blog-images/e2e_ksql_connect_blog/end-to-end-pipeline.jpg +teaser: Complete End-to-End stream processing pipeline using KSQLDB and Kafka Connect with both Confluent Cloud and Standalone Confluent Platform +toc: true +--- + +## Introduction + +KSQLDB and Kafka Connect are two technologies that are commonly used together in the Apache Kafka ecosystem for stream processing and data integration. + +Kafka Connect is a framework for building and running connectors between Kafka and other data systems, such as databases, file systems, and message queues. Connectors are plugins that are responsible for moving data between Kafka and external systems, either in a one-way or a two-way fashion. Kafka Connect provides a scalable and fault-tolerant way of integrating data between different systems. + +KSQLDB, on the other hand, is a streaming SQL engine for Apache Kafka that allows developers to write SQL queries on top of Kafka streams. KSQLDB makes it easy to perform stream processing tasks, such as filtering, aggregating, and joining streams of data in real-time. KSQLDB also provides support for stream-table joins, which allows developers to join a Kafka stream with a Kafka table. + +## KSQLDB and Kafka Connect in Confluent Architecture + +Confluent Cloud is a fully managed cloud-native event streaming platform that provides a scalable and secure way to build real-time streaming applications in the cloud. It offers connectors that are pre-configured, fully managed, and can be deployed with just a few clicks in the Confluent Cloud Console or via the Confluent Cloud CLI. 
It also provides a user-friendly web interface for managing and executing queries in KSQLDB, as well as an API for programmatic access. + +Confluent Platform, on the other hand, is a self-managed version of the Confluent stack that can be deployed on-premises or in the cloud. Confluent Platform provides additional features such as multi-datacenter replication, advanced security options, and enhanced monitoring and management capabilities. The Kafka Connect and KSQLDB servers are self managed and available through HTTP API for use. The required connectors need to be manually loaded into Kafka Connect server and configured. + +In summary, both Confluent Cloud and Confluent Platform provide Kafka clusters, Kafka Connect, and KSQLDB, but Confluent Platform offers additional features and customization options for those who require more control over their Kafka infrastructure. + +## Managed vs Self-hosted + +### Operation Cost + +Confluent Cloud requires a Kafka cluster for using Connectors and KSQLDB. It primarily offers three types of clusters, Basic, Standard and Dedicated. The pricing are as follows, + +- Basic - $0 per hour +- Standard - $1.50 per hour +- Dedicated - Cost based on number of CKUs (Confluent Unit for Kafka) + +The connector deployments are priced based on the connector task instance and the amount of data being transferred. Please look at this [link](https://www.confluent.io/confluent-cloud/connect-pricing/) for more details. + +KSQLDB servers are priced as **$0.23 per CSU hour**. A CSU (Confluent Streaming Unit) is the compute unit for fully managed ksqlDB. Clusters can be configured with 1, 2, 4, 8, or 12 CSUs based upon the workload. + +In Confluent Platform, both Kafka Connect cluster and KSQLDB server are available under Confluent Community license and does not have any licensing cost. The components have its own requirement on Memory and CPU based on the expected usage, so the cost of setting up the infrastructure locally or in cloud exists. The commercial connectors still needs to be licensed before use in Kafka Connect. + +### Deployment Options + +Kafka Connect and KSQL clusters can be deployed in Confluent Cloud using any of the following methods, + +- Confluent Cloud console +- [Confluent CLI](https://docs.confluent.io/confluent-cli/current/overview.html#confluent-cli-overview) +- [Confluent Terraform provider](https://registry.terraform.io/providers/confluentinc/confluent/latest) +- [Confluent Pulumi provider](https://www.pulumi.com/registry/packages/confluentcloud/) + +Both Terraform and Pulumi providers automate deployments and offer version control when used in conjunction with Github, Gitlab etc. + +Kafka Connect and KSQL clusters can be deployed in Confluent Platform using any of the following methods, + +- [Manual Confluent Package Installation](https://docs.confluent.io/platform/current/installation/overview.html#install-cp-on-premises) +- [Docker images](https://docs.confluent.io/platform/current/installation/docker/installation.html) +- [Ansible playbook for Confluent Platform](https://docs.confluent.io/ansible/current/overview.html) +- [Confluent for Kubernetes](https://docs.confluent.io/operator/current/overview.html) + +Ansible playbooks offer a simple automated way to deploy Confluent Platform components in multiple machines. Confluent for Kubernetes (CFK) is a cloud-native control plane for deploying and managing Confluent in your private cloud environment. 
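
To make the cost comparison above more concrete, here is a rough, back-of-envelope estimate in Python. The hourly rates are the list prices quoted earlier in this section, the 4-CSU sizing is an arbitrary example, and connector and data transfer charges are deliberately left out because they depend on task count and volume.

```python
# Back-of-envelope monthly estimate for a fully managed setup.
HOURS_PER_MONTH = 730  # roughly 365 * 24 / 12

standard_cluster_per_hour = 1.50   # Standard Kafka cluster, $/hour
csu_per_hour = 0.23                # fully managed ksqlDB, $/CSU-hour
csus = 4                           # example ksqlDB sizing

monthly = HOURS_PER_MONTH * (standard_cluster_per_hour + csus * csu_per_hour)
print(f"~${monthly:,.0f} per month before connector and networking charges")
# roughly $1,095 for the cluster + $672 for ksqlDB, about $1,767 in total
```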
+ +Based on this execution model and depending on how we want to run our queries, currently, ksqlDB provides two deployment options. + +- ****Headless ksqlDB deployments -**** In the headless mode, you write all of your queries in a SQL file, and then start ksqlDB server instances with this file as an argument. Confluent Platform supports this type of KSQL deployment. +- ****Interactive ksqlDB deployments -**** In interactive mode, you interact with the ksqlDB servers through a REST API either directly via your favorite REST clients, through ksqlDB CLI or through Confluent Control Center. Both Confluent Cloud and Confluent Platform support this type of KSQL deployment. + +### Multi-Tenancy + +Kafka Connect and KSQLDB does not natively support multi-tenancy in both Confluent Cloud and Confluent Platform, but it is possible to implement a multi-tenancy model. + +Deploy multiple Kafka Connect and KSQLDB clusters, each dedicated to a specific tenant or application. Each Kafka Connect cluster can be configured with its own set of connectors and configurations, and can run on a separate set of hardware resources. This will lead to resource isolation in terms of connector tasks and scaling resources based on application requirements. + +Similarly, each KSQLDB cluster can be used to host one application or use case with dedicated resources. Deploying multiple applications in a single KSQL cluster can lead to resource issues because of an application being greedy for resources. It also makes it easier to reason about scaling, failovers, replication and resource utilization. + +### Future + +KSQLDB provides the ability to define and use user-defined functions (UDFs) in SQL queries. UDFs allow developers to extend the functionality of KSQLDB by defining their own custom functions that can be used in SQL expressions. As of now, the support for KSQL UDF is only present in Confluent Platform (self-hosted) deployments. Support for KSQL UDF might be introduced for Confluent Cloud as well. + +Recently, Confluent acquired Immerok which is a contributor to the open source stream processing application technology Apache Flink. So, there might be a support released for FlinkSQL in KSQLDB in the future. + +## End-To-End Streaming Pipeline + +We will now build a sample end-to-end streaming pipeline using Kafka Connect and KSQLDB in both Confluent Cloud and Confluent Platform. Both the scenarios will be using Kafka Cluster deployed in Confluent Cloud for Kafka Topic storage. So, please create a Basic Kafka cluster in Confluent Cloud by registering in this [link](https://confluent.cloud/signup). + +![faust_blog_1.png](../assets/blog-images/e2e_ksql_connect_blog/faust_blog_1.png) + +This demo will also require Schema registry cluster for storing Avro schema of the record values. You can create a Schema registry cluster in the Confluent Cloud by enabling the Streams Governance Essentials Package. You can follow this [link](https://docs.confluent.io/cloud/current/stream-governance/packages.html#stream-governance-packages-features-and-limits) for more information. + +![streamsgovernanceimage.png](../assets/blog-images/e2e_ksql_connect_blog/streamsgovernanceimage.png) + +Create a Kafka cluster API Key in your Confluent cloud account from under the Cluster Overview section, so that the connectors and KSQLDB can access data from Kafka topics during stream processing. + +![CreateKafkaApiKey.png](../assets/blog-images/e2e_ksql_connect_blog/CreateKafkaApiKey.png) + +Following are the details of the streaming pipeline, + +1. 
Deploy a Datagen source connector to inject “Shoe Clickstream” data into the input topic +2. Create a KSQL Stream for the input topic +3. Create a aggregated KSQL table to calculate the total view time per product per user for every 1 minute time window and write it to a output topic. +4. Sink the data from output topic to a Postgres database using the JDBC Sink Connector. + +The connectors used in this demo will be deployed using SQL queries in KSQL in both Confluent Cloud and Confluent Platform deployments. + +![StreamDesignerPipeline.png](../assets/blog-images/e2e_ksql_connect_blog/StreamDesignerPipeline.png) + +### Confluent Cloud + +We will use the Confluent Cloud console to create KSQL cluster but this can be achieved through any of the above mentioned deployment options for Confluent Cloud. + +Create a KSQL cluster in your Confluent Cloud account with Global access (only for testing). + +![CreateKSQLinCC.png](../assets/blog-images/e2e_ksql_connect_blog/CreateKSQLinCC.png) + +Once, the KSQL cluster is provisioned. we will create a Datagen source connector to push data into the input topic `clickstream_cloud_input`. We will use the `SHOE_CLICKSTREAM` quick start data generator and set the data type as `AVRO` for the record values. The Kafka API key pair created above will be used to create the input topic and write data into it. + +```sql +CREATE SOURCE CONNECTOR "ClickstreamDataGenSourceConnector" WITH ( + "connector.class"='DatagenSource', + "kafka.api.key"='${KAFKA_API_KEY}', + "kafka.api.secret"='${KAFKA_API_SECRET}', + "kafka.topic"='clickstream_cloud_input', + "output.data.format"='AVRO', + "quickstart"='SHOE_CLICKSTREAM', + "tasks.max"='1' +); +``` + +Once the Datagen source connector is successfully deployed, we should be able to see the generated records for the input topic in the Confluent Cloud UI. + +![InputTopicMessagesCrop.png](../assets/blog-images/e2e_ksql_connect_blog/InputTopicMessagesCrop.png) + +Let’s create a Kafka stream called `CLICKSTREAM_CLOUD_STREAM` from this input topic `clickstream_cloud_input` for further processing. We will mention the timestamp field in the message value to be used for windowing by using the `timestamp` variable under the `WITH` clause. + +```sql +CREATE OR REPLACE STREAM CLICKSTREAM_CLOUD_STREAM +(PRODUCT_ID VARCHAR, USER_ID VARCHAR, VIEW_TIME INT, +PAGE_URL VARCHAR, IP VARCHAR, TS BIGINT) +WITH (kafka_topic='clickstream_cloud_input', partitions=1, +value_format='AVRO', timestamp='TS'); +``` + +Create a Kafka Table called `USER_PRODUCT_VIEWTIME` backed by a Kafka topic `clickstream_cloud_output` which aggregates the total view time per product per user for every 1 minute time window. The aggregated sum of the view time is saved as `TOTAL_VIEW_TIME` field in the table. 
+ +```sql +CREATE OR REPLACE TABLE USER_PRODUCT_VIEWTIME +WITH (kafka_topic='clickstream_cloud_output', partitions=1, +key_format='AVRO', value_format='AVRO') +AS SELECT USER_ID, PRODUCT_ID, AS_VALUE(USER_ID) AS USERID, +AS_VALUE(PRODUCT_ID) AS PRODUCTID, SUM(VIEW_TIME) AS TOTAL_VIEW_TIME, +WINDOWEND AS END_TIME, WINDOWSTART AS START_TIME +FROM CLICKSTREAM_CLOUD_STREAM +WINDOW TUMBLING (SIZE 1 MINUTES, RETENTION 7 DAYS) +GROUP BY USER_ID, PRODUCT_ID; +``` + +Following, is a sample view of the records received by the ouput topic `clickstream_cloud_output` + +![OutputTopicMessagesCrop.png](../assets/blog-images/e2e_ksql_connect_blog/OutputTopicMessagesCrop.png) + +Let’s create a Postgres JDBC Sink connector which writes the aggregated values from the output topic to a remote Postgres database. + +```sql +CREATE SINK CONNECTOR "PostgresClickstreamSinkConnector" WITH ( + 'connector.class' = 'PostgresSink', + 'kafka.api.key' = '${KAFKA_API_KEY}', + 'kafka.api.secret' = '${KAFKA_API_SECRET}', + 'connection.host' = 'postgresql-119942-0.cloudclusters.net', + 'connection.port' = '10091', + 'connection.user' = '${POSTGRES_USER}', + 'connection.password' = '${POSTGRES_PASSWORD}', + 'db.name' = 'default', + 'topics' = 'clickstream_cloud_output', + 'input.data.format' = 'AVRO', + 'input.key.format' = 'AVRO', + 'delete.enabled' = 'false', + 'ssl.mode' = 'prefer', + 'insert.mode' = 'UPSERT', + 'table.name.format' = 'clickstream_user_product_viewtime', + 'db.timezone' = 'UTC', + 'pk.mode' = 'record_value', + 'pk.fields' = 'USERID,PRODUCTID', + 'auto.create' = 'true', + 'auto.evolve' = 'true', + 'quote.sql.identifiers' = 'ALWAYS', + 'batch.sizes' = '3000', + 'tasks.max' = '1' +); +``` + +Here is the sample view of the `clickstream_user_product_viewtime` table in the Postgres database post the sink connector deployment. + +![PostgresTableOutputCrop.png](../assets/blog-images/e2e_ksql_connect_blog/PostgresTableOutputCrop.png) + +### Confluent Platform + +For Confluent Platform, we will use the Kafka Connect and KSQLDB docker images to deploy the respective clusters in a hybrid mode i.e. use the Kafka cluster in Confluent Cloud for Kafka topics. + +We will be deploying the streaming pipeline using the KSQL interactive mode by using KSQLDB CLI image to connect to the KSQLDB cluster. + +We will need to supply the credentials of the Schema Registry, which can be obtained from Confluent Cloud, in case of using Avro, JsonSchema or Protobuf formats for record key or value. + +The Kafka Connect image needs to be baked with the required connector jars before deployment. We will build a custom Kafka connect docker image by using the below `Dockerfile`, + +```docker +FROM confluentinc/cp-kafka-connect:7.3.1 + +RUN wget http://client.hub.confluent.io/confluent-hub-client-latest.tar.gz && tar -xzf confluent-hub-client-latest.tar.gz + +RUN chmod +x ./bin/confluent-hub + +ENV CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components" + +RUN ./bin/confluent-hub install --no-prompt confluentinc/kafka-connect-datagen:0.6.0 +RUN ./bin/confluent-hub install --no-prompt confluentinc/kafka-connect-jdbc:10.7.0 +``` + +Following is the docker compose file for deploying Kafka Connect, KSQLDB server, KSQLDB CLI and Postgres DB + +`docker-compose.yml` + +```yaml +version: '3.7' +services: + connect: + build: . 
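    # Build the Connect image from the local Dockerfile above, which bakes in
    # the Datagen and JDBC connector plugins via confluent-hub.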
+ hostname: connect + container_name: sample-connect + ports: + - "8083:8083" + environment: + CONNECT_BOOTSTRAP_SERVERS: "${KAFKA_BOOTSTRAP_SERVER}" + CONNECT_SECURITY_PROTOCOL: SASL_SSL + CONNECT_SASL_MECHANISM: PLAIN + CONNECT_SASL_JAAS_CONFIG: | + org.apache.kafka.common.security.plain.PlainLoginModule required \ + username="${KAFKA_API_KEY}" \ + password="${KAFKA_API_SECRET}"; + CONNECT_REST_ADVERTISED_HOST_NAME: connect + CONNECT_REST_PORT: 8083 + CONNECT_GROUP_ID: test-connect-group + CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs + CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 3 + CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 + CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets + CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 3 + CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status + CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 3 + CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components" + CONNECT_TOPIC_CREATION_ENABLE: "true" + CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO" + CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_KEY_CONVERTER: "io.confluent.connect.avro.AvroConverter" + CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: "true" + CONNECT_KEY_CONVERTER_SCHEMA_REGISTRY_URL: "${SCHEMA_REGISTRY_URL}" + CONNECT_KEY_CONVERTER_BASIC_AUTH_CREDENTIALS_SOURCE: USER_INFO + CONNECT_KEY_CONVERTER_SCHEMA_REGISTRY_BASIC_AUTH_USER_INFO: "${SCHEMA_REGISTRY_API_KEY}:${SCHEMA_REGISTRY_API_SECRET}" + CONNECT_VALUE_CONVERTER: "io.confluent.connect.avro.AvroConverter" + CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "true" + CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: "${SCHEMA_REGISTRY_URL}" + CONNECT_VALUE_CONVERTER_BASIC_AUTH_CREDENTIALS_SOURCE: USER_INFO + CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_BASIC_AUTH_USER_INFO: "${SCHEMA_REGISTRY_API_KEY}:${SCHEMA_REGISTRY_API_SECRET}" + CONNECT_CONNECTOR_CLIENT_CONFIG_OVERRIDE_POLICY: "All" + CONNECT_CONUSMER_SECURITY_PROTOCOL: SASL_SSL + CONNECT_CONUSMER_SASL_MECHANISM: PLAIN + CONNECT_CONUSMER_SASL_JAAS_CONFIG: | + org.apache.kafka.common.security.plain.PlainLoginModule required \ + username="${KAFKA_API_KEY}" \ + password="${KAFKA_API_SECRET}"; + CONNECT_PRODUCER_SECURITY_PROTOCOL: SASL_SSL + CONNECT_PRODUCER_SASL_MECHANISM: PLAIN + CONNECT_PRODUCER_SASL_JAAS_CONFIG: | + org.apache.kafka.common.security.plain.PlainLoginModule required \ + username="${KAFKA_API_KEY}" \ + password="${KAFKA_API_SECRET}"; + + ksqldb-server: + image: confluentinc/cp-ksqldb-server:7.3.1 + hostname: ksqldb-server + container_name: ksqldb-server-container + depends_on: + - connect + ports: + - "8088:8088" + # volumes: + # - ./ksql.sql:/opt/ksql/ksql.sql + environment: + KSQL_CONFIG_DIR: "/etc/ksql" + KSQL_AUTO_OFFSET_RESET: "earliest" + KSQL_BOOTSTRAP_SERVERS: "${KAFKA_BOOTSTRAP_SERVER}" + KSQL_HOST_NAME: ksqldb-server + KSQL_LISTENERS: "http://0.0.0.0:8088" + KSQL_CACHE_MAX_BYTES_BUFFERING: 0 + KSQL_KSQL_SCHEMA_REGISTRY_URL: "${SCHEMA_REGISTRY_URL}" + KSQL_KSQL_SCHEMA_REGISTRY_BASIC_AUTH_CREDENTIALS_SOURCE: USER_INFO + KSQL_KSQL_SCHEMA_REGISTRY_BASIC_AUTH_USER_INFO: "${SCHEMA_REGISTRY_API_KEY}:${SCHEMA_REGISTRY_API_SECRET}" + KSQL_KSQL_CONNECT_URL: "http://connect:8083" + KSQL_KSQL_LOGGING_PROCESSING_TOPIC_REPLICATION_FACTOR: 3 + KSQL_KSQL_LOGGING_PROCESSING_TOPIC_AUTO_CREATE: 'true' + KSQL_KSQL_LOGGING_PROCESSING_STREAM_AUTO_CREATE: 'true' + KSQL_LOG4J_ROOT_LOGLEVEL: INFO + KSQL_KSQL_SERVICE_ID: "test-ksql-cluster" + KSQL_KSQL_STREAMS_REPLICATION_FACTOR: 3 + 
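      # ksqlDB internal, streams and processing-log topics are created in the
      # Confluent Cloud cluster, so a replication factor of 3 mirrors its defaults.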
KSQL_KSQL_INTERNAL_TOPIC_REPLICAS: 3 + KSQL_SECURITY_PROTOCOL: SASL_SSL + KSQL_SASL_MECHANISM: PLAIN + KSQL_SASL_JAAS_CONFIG: | + org.apache.kafka.common.security.plain.PlainLoginModule required \ + username="${KAFKA_API_KEY}" \ + password="${KAFKA_API_SECRET}"; + + ksqldb-cli: + image: confluentinc/cp-ksqldb-cli:7.3.1 + container_name: ksqldb-cli + entrypoint: /bin/sh + tty: true + + pg-database: + image: "postgres:11" + ports: + - "5432:5432" + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: password + POSTGRES_DB: postgres +``` + +The Kafka Connect and KSQLDB services will connect to Kafka cluster in Confluent Cloud using the bootstrap server and the SASL credentials provided as environment variables for the respective services. The Schema registry URL and its credentials are also provided for both the services. + +The config `CONNECT_TOPIC_CREATION_ENABLE: "true"` needs to be set up in Kafka Connect cluster to enable the input topic creation through Source connectors. + +Start the docker services in `build` mode to build the custom Kafka Connect image using the `Dockerfile` we created, + +```bash +docker-compose up -d --build +``` + +Once all the services are successfully up, attach to the KSQLDB CLI container’s bash, + +```bash +docker exec -it ksqldb-cli bash +``` + +Connect to the KSQLDB server inside the KSQLDB CLI container, + +```bash +ksql http://ksqldb-server:8088 +``` + +![KSQLdbCLI.png](../assets/blog-images/e2e_ksql_connect_blog/KSQLdbCLI.png) + +Deploy the Datagen Source connector, + +```sql +CREATE SOURCE CONNECTOR "ClickstreamDataGenSourceConnector" WITH ( + "connector.class"='io.confluent.kafka.connect.datagen.DatagenConnector', + "kafka.api.key"='${KAFKA_API_KEY}', + "kafka.api.secret"='${KAFKA_API_SECRET}', + "kafka.topic"='clickstream_cloud_input', + "output.data.format"='AVRO', + "quickstart"='SHOE_CLICKSTREAM', + "topic.creation.default.partitions"='1', + "topic.creation.default.replication.factor"='3', + "tasks.max"='1' +); +``` + +Create the Kafka stream from the input topic `clickstream_cloud_input`, + +```sql +CREATE OR REPLACE STREAM CLICKSTREAM_CLOUD_STREAM +(PRODUCT_ID VARCHAR, USER_ID VARCHAR, VIEW_TIME INT, +PAGE_URL VARCHAR, IP VARCHAR, TS BIGINT) +WITH (kafka_topic='clickstream_cloud_input', partitions=1, +value_format='AVRO', timestamp='TS'); +``` + +Create the aggregated table backed by a output topic `clickstream_cloud_output`, + +```sql +CREATE OR REPLACE TABLE USER_PRODUCT_VIEWTIME +WITH (kafka_topic='clickstream_cloud_output', partitions=1, +key_format='AVRO', value_format='AVRO') +AS SELECT USER_ID, PRODUCT_ID, AS_VALUE(USER_ID) AS USERID, +AS_VALUE(PRODUCT_ID) AS PRODUCTID, SUM(VIEW_TIME) AS TOTAL_VIEW_TIME, +WINDOWEND AS END_TIME, WINDOWSTART AS START_TIME +FROM CLICKSTREAM_CLOUD_STREAM WINDOW TUMBLING (SIZE 1 MINUTES, RETENTION 7 DAYS) +GROUP BY USER_ID, PRODUCT_ID; +``` + +Create the JDBC Postgres Sink Connector to sink data to the local Postgres database, + +```sql +CREATE SINK CONNECTOR "PostgresClickstreamSinkConnector" WITH ( + 'connector.class' = 'io.confluent.connect.jdbc.JdbcSinkConnector', + 'connection.url' = 'jdbc:postgresql://pg-database:5432/postgres?user=postgres&password=password&ssl=false', + 'topics' = 'clickstream_cloud_output', + 'input.data.format' = 'AVRO', + 'input.key.format' = 'AVRO', + 'delete.enabled' = 'false', + 'ssl.mode' = 'prefer', + 'insert.mode' = 'UPSERT', + 'table.name.format' = 'clickstream_user_product_viewtime', + 'db.timezone' = 'UTC', + 'pk.mode' = 'record_value', + 'pk.fields' = 
'USERID,PRODUCTID',
+ 'auto.create' = 'true',
+ 'auto.evolve' = 'true',
+ 'quote.sql.identifiers' = 'ALWAYS',
+ 'batch.size' = '3000',
+ 'tasks.max' = '1',
+ 'consumer.override.sasl.mechanism' = 'PLAIN',
+ 'consumer.override.security.protocol' = 'SASL_SSL',
+ 'consumer.override.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"${KAFKA_API_KEY}\" password=\"${KAFKA_API_SECRET}\";'
+);
+```
+
+## Conclusion
+
+In conclusion, building an end-to-end streaming pipeline using Kafka Connect and KSQLDB can be a powerful way to enable real-time data processing and analytics. By leveraging Kafka Connect for data ingestion and KSQLDB for data processing and analysis, developers can build complex streaming applications that can scale horizontally as needed.
+
+Deploying this architecture in a managed environment such as Confluent Cloud provides several benefits, including reduced infrastructure management overhead, automatic scaling, and built-in support for high availability and disaster recovery. However, this comes at the cost of higher pricing and potentially reduced flexibility in terms of configuration and customization.
+
+On the other hand, the self-hosted deployment mode provides greater control and flexibility over the deployment and configuration of Kafka Connect and KSQLDB. However, this approach requires more infrastructure management and may require more upfront investment in terms of hardware and software resources.
diff --git a/_posts/2024-03-13-Flink-Deployment-Patterns.md b/_posts/2024-03-13-Flink-Deployment-Patterns.md
new file mode 100644
index 0000000000..a3175460df
--- /dev/null
+++ b/_posts/2024-03-13-Flink-Deployment-Patterns.md
@@ -0,0 +1,141 @@
+---
+layout: post
+title: "Flink Deployment Patterns"
+authors: ashwin,avinash
+categories: [Platform Engineering, Data, Infrastructure, Kafka, Apache Flink, Kubernetes]
+image: assets/blog-images/flink_deployment/flink-application.png
+featured: false
+hidden: false
+teaser: Overview of available deployment patterns for Flink Cluster
+toc: true
+---
+# Introduction
+
+Apache Flink is an open-source, unified stream-processing and batch-processing framework designed for high performance, scalability, and accuracy. Apache Flink has a very large and vibrant community, with 450+ contributors and 15,500+ commits, supported by companies like Apple and Alibaba, which helps ensure continual innovation. It powers stream processing platforms at many companies, including digital natives like Uber, Netflix, and LinkedIn, as well as successful enterprises like ING, Goldman Sachs, and Comcast.
+
+Apache Flink supports high throughput and low event latency at the same time, fault tolerance with *exactly-once* processing guarantees, and event-time and out-of-order processing. It has elegant and fluent APIs in Java, Scala and Python, and libraries for Graph processing (batch), Machine Learning (batch), and Complex Event Processing (streaming). Flink is the de facto industry standard for stream processing.
+
+# Cluster Framework
+
+![flink_architecture.jpg](../assets/blog-images/flink_deployment/flink_architecture.jpg)
+
+- Apache Flink jobs run on clusters, which are composed of two types of nodes: `TaskManagers` and `JobManagers`.
+- While clusters typically consist of multiple `TaskManagers`, the only reason to run multiple JobManagers is high availability.
+- Jobs are *submitted* to the `JobManager` by the `FlinkClient`, which compiles the user application into a dataflow graph that the `JobManager` can execute. 
+- `JobManager` then coordinates job execution: it splits the parallel units of a job across `TaskManagers`, manages heartbeats, triggers checkpoints, reacts to failures and much more.
+- The JobManager supports multiple job submission modes:
+    - Application mode
+    - Session mode
+
+## Application mode
+
+This mode creates one cluster per submitted application. A dedicated JobManager is started for submitting the job. The JobManager will only execute this job, then exit. The Flink application runs on the JobManager. With this architecture, the *Application Mode* provides resource isolation and load balancing guarantees at the granularity of a whole application.
+
+## Session mode
+
+*Session mode* assumes an already running cluster and uses the resources of that cluster to execute any submitted application. Applications executed in the same (session) cluster use, and consequently compete for, the same resources. This has the advantage that you do not pay the resource overhead of spinning up a full cluster for every submitted job.
+
+# Deployment Patterns
+
+Flink is a versatile framework, supporting many different deployment patterns in a mix-and-match fashion. The available deployment patterns include:
+
+1. Standalone
+2. YARN
+3. Kubernetes
+4. Cloud
+
+## Standalone
+
+The standalone mode is the most barebones way of deploying Flink. The Flink services such as the JobManager and TaskManager are simply launched as processes on the operating system. Flink runs on all UNIX-like systems, e.g. Linux, Mac OS X and Cygwin, and requires **Java 1.8.x** or higher to be installed.
+
+In production environments, Flink clusters have to be deployed with high availability (HA) to ensure smooth processing. To enable HA for a standalone cluster, you have to use the ZooKeeper HA services. Flink leverages **ZooKeeper** for *distributed coordination* between all running JobManager instances.
+
+For more details on the deployment steps, please refer to this [link](https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/deployment/resource-providers/standalone/overview/#example-standalone-ha-cluster-with-2-jobmanagers).
+
+Standalone deployment in Flink offers simplicity and flexibility, making it suitable for various use cases ranging from small-scale development and testing environments to large-scale production deployments. However, it also requires manual management of resources and may not provide the same level of resource isolation and multi-tenancy capabilities as other deployment options like YARN or Kubernetes.
+
+## YARN
+
+YARN is a popular resource manager for running Hadoop jobs and other data processing applications such as MapReduce, Spark, Storm and Flink, among others. YARN (Yet Another Resource Negotiator) was born out of the idea to separate the concerns of resource management and job scheduling/monitoring. YARN achieves this with two components - the Resource Manager and the Node Manager. The Resource Manager accepts jobs from the client and schedules them to be run on the Node Manager. The Node Manager runs the application by creating multiple containers. Apache Flink can be [deployed](https://nightlies.apache.org/flink/flink-docs-master/docs/deployment/resource-providers/yarn/) using YARN by creating Flink’s JobManager and TaskManager in such containers. This allows Flink to dynamically allocate and de-allocate TaskManager resources based on the running job.
+
+YARN allows deploying Flink applications in all deployment modes supported by Apache Flink, including the deprecated Per-Job mode. 
Flink applications can be deployed in either the Application Mode or the Session Mode. YARN also allows submitting Flink jobs using the SQL client. This makes it an attractive choice for deploying Apache Flink if there are existing YARN clusters running, or if there are plans to use other data processing systems in conjunction with Flink, allowing them to be deployed together. YARN can be scaled up or down depending on the load and provides token-based security with Kerberos authentication.
+
+YARN is not a particularly great choice if you are not already in the Apache Hadoop ecosystem, since there is a steep learning curve and it requires expertise in managing the additional configurations and settings, which can be difficult for those who are not familiar with YARN.
+
+## Kubernetes
+
+Kubernetes has grown to be the de facto orchestrator for deploying cloud-native workloads. Apache Flink comes with a [native integration with Kubernetes](https://nightlies.apache.org/flink/flink-docs-master/docs/deployment/resource-providers/native_kubernetes/) to deploy Flink. Similar to YARN, the native Kubernetes integration allows Flink to dynamically allocate and de-allocate TaskManager resources based on the running job. Apart from the native integration, there are multiple Kubernetes Operators that allow creating Flink Jobs declaratively.
+
+The native Kubernetes integration allows running Flink jobs without the hassle of writing YAML files, abstracting away the Kubernetes complexity while providing the benefits of Kubernetes such as auto-healing and resource orchestration. This can be a good first step towards running Flink Jobs in Kubernetes while using the same clients for submitting Flink Jobs. Flink also offers [High Availability (HA)](https://nightlies.apache.org/flink/flink-docs-master/docs/deployment/ha/kubernetes_ha/) when deploying using the native Kubernetes integration.
+
+There are multiple open source Kubernetes Operators for Flink, such as the official [Apache Flink Operator](https://github.com/apache/flink-kubernetes-operator), [Flinkk8soperator by Lyft](https://github.com/lyft/flinkk8soperator) and the [Flink on Kubernetes operator by Spotify](https://github.com/spotify/flink-on-k8s-operator). These operators allow declaratively defining the Flink cluster and creating Flink Jobs using a Custom Resource in Kubernetes. This implies that Flink Jobs can be managed natively using Kubernetes clients and operational governance can be implemented with GitOps.
+
+While there are several advantages to deploying Flink on Kubernetes, such as high availability and scalability, it requires expertise in running and maintaining a Kubernetes cluster along with ensuring the Flink applications are fault tolerant with appropriate recovery mechanisms.
+
+## Cloud
+
+Fully managed services abstract away much of the complexity involved in deploying and managing Apache Flink clusters. Users don’t need to worry about infrastructure provisioning, cluster configuration, or software updates, as these tasks are handled by the managed service provider. Users can deploy highly available and durable applications with Multi-AZ deployments and APIs for application lifecycle management.
+
+Managed services provide built-in monitoring and logging capabilities, allowing users to easily track the performance and health of their Flink applications. They also offer enhanced security features, such as encryption, access controls, and compliance certifications. 
Finally, managed services integrate seamlessly with other cloud services and data processing tools, allowing users to build end-to-end data pipelines more easily.
+
+# Challenges in self-managed Flink clusters
+
+While self-managing a Flink cluster offers greater flexibility in terms of the APIs and connectors supported, it comes with the operational hazard of maintaining a platform. As with any self-managed system, self-managing Flink implies being responsible for securing and scaling the system. With self-managed Flink clusters, the team’s focus shifts towards managing infrastructure rather than creating streaming applications. Using a managed service allows these operational burdens to be offloaded to a provider. Certain managed services handle checkpointing as well, allowing the developer to focus purely on the streaming logic.
+
+# How does Confluent’s fully managed Flink service help developers?
+
+Kafka has become the de facto standard messaging system and, similarly, Flink is the de facto industry standard for stream processing. The Confluent offerings for Apache Kafka and Apache Flink go hand in hand to facilitate a rich environment for real-time stream processing.
+
+Confluent Cloud provides a cloud-native, serverless service for Flink that enables simple, scalable, and secure stream processing that integrates seamlessly with Apache Kafka. Your Kafka topics appear automatically as queryable Flink tables, with schemas and metadata attached by Confluent Cloud.
+
+![https://docs.confluent.io/cloud/current/_images/flink-kafka-ecosystem.png](https://docs.confluent.io/cloud/current/_images/flink-kafka-ecosystem.png)
+
+Confluent Cloud for Apache Flink is a cloud-native service in the truest sense. Confluent’s fully managed Flink service enables you to:
+
+- Easily filter, join, and enrich your data streams with Flink
+- Enable high-performance and efficient stream processing at any scale, without the complexities of managing infrastructure
+- Experience Kafka and Flink as a unified platform, with fully integrated monitoring, security, and governance
+
+Confluent Cloud for Apache Flink lets you focus fully on your business logic, encapsulated in Flink SQL [statements](https://docs.confluent.io/cloud/current/flink/concepts/statements.html#flink-sql-statements), while Confluent Cloud takes care of what’s needed to run them in a secure, resource-efficient and fault-tolerant manner.
+
+## Fully Managed
+
+Confluent Cloud always uses the latest Flink version. Any security patches to the Flink runtime are applied seamlessly and automatically.
+
+## Auto-scaling
+
+Flink SQL statements on Confluent Cloud are monitored continuously and [auto-scaled](https://docs.confluent.io/cloud/current/flink/concepts/autopilot.html#flink-sql-autopilot) to keep up with the throughput rate of their input topics.
+
+## Usage-based billing
+
+You pay only for what you use, not what you provision. Flink compute in Confluent Cloud is elastic: once you stop using the compute resources, they are deallocated, and you no longer pay for them.
+
+## Metadata Sync
+
+Kafka topics and schemas are always in sync with Flink, simplifying how you can process your data. 
Any topic created in Kafka is visible directly as a table in Flink, and any table created in Flink is visible as a topic in Kafka + +## Observability + +Confluent Cloud provides you with a curated set of metrics, exposing them through Confluent’s existing [metrics API](https://docs.confluent.io/cloud/current/monitoring/metrics-api.html#metrics-api). Confluent Cloud provides first-class integrations with New Relic, Datadog, Grafana Cloud, and Dynatrace + +## Security + +Confluent Cloud for Apache Flink has a deep integration with [Role-Based Access Control (RBAC)](https://docs.confluent.io/cloud/current/access-management/access-control/rbac/overview.html#cloud-rbac), ensuring that you can easily access and process the data that you have access to, and no other data. + +## Cloud Availability + +Confluent Cloud for Apache Flink is available on AWS, Azure, and GCP. Refer to this [link](https://docs.confluent.io/cloud/current/flink/index.html#af-long-is-everywhere) for the supported region wise list. + +Learn more about Confluent’s [cloud-native,serverless Apache Flink® service](https://cnfl.io/49m1KuN)—now +generally available on AWS, Azure, and Google Cloud. + +# Conclusion + +Apache Flink stands as a robust and versatile framework for real-time stream processing and batch processing with its fault tolerance, high throughput, low-latency processing capabilities, and seamless integration with various data sources. Managing Flink clusters involves dealing with infrastructure concerns such as deployment, scaling, and maintenance. By outsourcing this responsibility to managed services or cloud providers, businesses can redirect their focus towards their core competencies, such as developing innovative data processing pipelines tailored to their specific business needs. + +Confluent Cloud for Apache Flink enables businesses to build efficient and tailored data processing workflows without any infrastructure management hassles. It also seamlessly integrates with Confluent Kafka and Schema Registry. + +# References + +- [https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/deployment/overview/](https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/deployment/overview/) +- [https://docs.confluent.io/cloud/current/flink/index.html#stream-processing-with-af-long](https://docs.confluent.io/cloud/current/flink/index.html#stream-processing-with-af-long) diff --git a/_posts/2024-06-18-librdkafka-interceptors.md b/_posts/2024-06-18-librdkafka-interceptors.md new file mode 100644 index 0000000000..cc58699542 --- /dev/null +++ b/_posts/2024-06-18-librdkafka-interceptors.md @@ -0,0 +1,611 @@ +--- +layout: post +title: "Interceptors for Librdkafka" +authors: avinash +categories: [Data, Kafka, Librdkafka] +image: assets/blog-images/librdkafka_interceptors/KafkaInterceptors.png +featured: false +teaser: Building Kafka interceptors for librdkafka clients +--- + +Interceptors in Apache Kafka are plugins that intercept and potentially modify data that is being produced or consumed by Kafka clients. It allows the externalization of common functionality among clients without having to modify the underlying Kafka code. These interceptors can either be producer interceptors or consumer interceptors and act on a single message at a time. Interceptors hook into various stages of a produce or consume request, allowing access to the message and metadata through custom code. 
+
+![kafka_interceptor.jpg](../assets/blog-images/librdkafka_interceptors/KafkaInterceptor.png)
+
+Interceptors play a critical role in the Apache Kafka ecosystem, with use cases around observability, logging, and simple transformations with Kafka producers and consumers. Some of the common use cases or implementations of interceptors are:
+
+1. Disaster Recovery: The [Consumer Timestamp Interceptor](https://docs.confluent.io/platform/current/multi-dc-deployments/replicator/replicator-failover.html#configuring-the-consumer-for-failover-timestamp-preservation) is used by consumers to translate the offsets of the source cluster to the offsets in the destination cluster, based on the timestamp of the message, in a replicated environment.
+2. Monitoring: The [monitoring interceptor](https://docs.confluent.io/platform/current/control-center/installation/clients.html) allows users to monitor production and consumption of messages in the Confluent Control Center by producing telemetry to an additional topic for each message intercepted by the interceptor.
+3. Observability: One of the most common use cases for interceptors is implementing distributed tracing for Kafka clients. The [Zipkin interceptor](https://github.com/openzipkin-contrib/brave-kafka-interceptor) enables tracing for Kafka producers and consumers and sends these traces to Zipkin.
+4. Generic Tooling: Interceptors can also be used for general-purpose automation, such as auto-creating topics if they do not exist or verifying message integrity with signatures.
+5. Data transformation: Since interceptors can also potentially modify the Kafka message, they can be used to perform simple transformations on the message before it is produced or consumed.
+
+## Building interceptors
+
+The official Apache Kafka client library provides interfaces for writing producer and consumer interceptors in Java. These interceptors are configured by setting the `interceptor.classes` property on the producer or the consumer, respectively.
+
+### Producer Interceptor
+
+The [ProducerInterceptor](https://kafka.apache.org/10/javadoc/org/apache/kafka/clients/producer/ProducerInterceptor.html) interface has the following abstract methods:
+
+* `ProducerRecord<K, V> onSend(ProducerRecord<K, V> record);` \
+The onSend method is called when the producer client calls the send() method. This method has access to the complete ProducerRecord and can mutate the record before returning it to the next stage of the producer lifecycle.
+* `void onAcknowledgement(RecordMetadata metadata, Exception exception);` \
+The onAcknowledgement method is called when the broker returns the acknowledgement back to the client, along with metadata about the message, such as the message offset and partition.
+* `void close();` \
+The close method is called when the interceptor is closed. This is typically for any cleanup or closing activity that needs to be performed.
+
+Since the ProducerInterceptor extends the interface Configurable, the following method is also available within the ProducerInterceptor:
+
+* `void configure(Map<String, ?> configs);` \
+The configure method is called when an instance of the producer interceptor is created. This is typically for instantiating any global values for the interceptor.
+
+ProducerInterceptor callbacks may be called from multiple threads. Interceptor implementations must ensure thread safety, if needed. 
+
+### Consumer Interceptor
+
+The [ConsumerInterceptor](https://kafka.apache.org/10/javadoc/org/apache/kafka/clients/consumer/ConsumerInterceptor.html) interface has the following abstract methods:
+
+* `ConsumerRecords<K, V> onConsume(ConsumerRecords<K, V> records);` \
+The onConsume method is called just before the records fetched from the broker are returned to the application by a poll() call on the consumer. The method has access to the consumer records returned in the batch and can potentially modify the records before returning them.
+* `void onCommit(Map<TopicPartition, OffsetAndMetadata> offsets);` \
+The onCommit method is called when the consumer offsets are committed to the broker. It is provided with details about the topic partitions and offsets that were committed.
+* `void close();` \
+Similar to the ProducerInterceptor, the close method can be used for any cleanup or closing activity that needs to be performed when the interceptor is closed.
+
+Similar to the ProducerInterceptor, the ConsumerInterceptor also extends the interface `Configurable` and provides the configure method for any global instantiation. Unlike the ProducerInterceptor, the ConsumerInterceptor is called from the same thread that invokes `KafkaConsumer.poll(long)`.
+
+These Producer and Consumer Interceptor APIs are natively exposed only in the official Java client that is part of the Apache Kafka project. However, there are multiple other client libraries out there for languages apart from Java. [Librdkafka](https://github.com/confluentinc/librdkafka) is one such popular Kafka client library, written in C and C++. There are multiple other client libraries in languages such as Python, Go, C#, PHP, Rust, Haskell, and JavaScript that are built on top of librdkafka. You may want to extend the client capabilities in these languages as well and build an interceptor; however, the interceptor API is currently only available in C.
+
+In this blog, we will go over the step-by-step process of developing custom interceptors for both producers and consumers using librdkafka, for C and for other languages that use librdkafka.
+
+## Building interceptors with librdkafka
+
+Similar to the Java library, the librdkafka library exposes various functions in C to implement a producer or consumer interceptor at various stages of the produce/consume request.
+
+For the producer interceptor, the following functions are made available:
+
+* on_send
+* on_acknowledgement
+
+For the consumer interceptor, the following functions are made available:
+
+* on_consume
+* on_commit
+
+Once these methods are implemented, a shared object file needs to be created, which can then be used by librdkafka clients as a plugin. Although the interceptors need to be written in C, they can be used by any librdkafka client in any supported language.
+
+Interceptors can either be implemented alongside the client application code, by defining the above-mentioned methods using the direct API when the client language is C, or as an external standalone plugin library that uses the direct APIs. If the client language is not C, the latter approach needs to be taken by writing the interceptor code as an external plugin in C and integrating the shared object into the client’s plugin library.
+
+#### Key Gotcha
+
+> While the Java library for building interceptors allows for mutating the messages, the librdkafka API for interceptors does not allow modification of messages. Interceptors built with librdkafka are read-only interceptors. 
However, librdkafka interceptors can add, remove, or modify headers in a message. + + +### Use Case + +We will be building a producer and consumer interceptor to perform an audit of the message trail from producers and consumers. These interceptors will capture audit information about a message produced or consumed and send the audit details to another topic. The audit topic can be in the same cluster where the original production or consumption is happening or in a different cluster. + +When the message is initially produced, a correlation ID is generated and added to the message as a header. This correlation ID serves as an identifier for tracking the message across the broker and consumer(s). This allows us to build an audit trail for each individual message and track consumption or data loss. + +![kafka_interceptor.jpg](../assets/blog-images/librdkafka_interceptors/AuditLibrdkafkaInterceptors.png) + +We will be demonstrating the following functionality while building the interceptors: + + + +* Intercepting a message when a message is produced +* Intercepting the metadata when the broker sends an acknowledgement back to the producer +* Defining custom configuration for the interceptor +* Intercepting the messages before it is consumed by the consumer +* Initializing global configuration for an interceptor and cleaning up the configuration + +We will not be covering the scenarios for intercepting the offsets when committed by the consumer or when the configuration is copied instead of creating a new configuration for Kafka clients. + + +### Producer Interceptor with librdkafka + + +#### Interceptor instance + +Each interceptor plugin has an Interceptor instance, which is created either when the plugin (interceptor) is loaded or when the configuration which was previously set is copied. + +We will be declaring the interceptor instance in a struct called ici: \ + + + +```c +struct ici { + rd_kafka_conf_t *audit_producer_conf; /**< Interceptor-specific config */ + char *audit_topic; +}; +``` + + +The Interceptor instance can contain any number of members of any type. Here we are declaring an interceptor instance with 2 members: + + + +* `rd_kafka_conf_t *audit_producer_conf;` - Used to store the producer configuration for the audit topic +* `char *audit_topic;` - Used to store the name of the audit topic + + +#### conf_init + +When `plugin.library.paths` is set, the plugin’s conf initializer is called. In the initializer function, we can create and define global objects and functions for the interceptor. In the init function, we will instantiate the Interceptor instance `ici` along with defining the interceptor methods to be called for various interceptor lifecycle stages, such as new interceptor, configuration set, configuration copied (duplicated) and configuration destroyed. + + +```c +static void conf_init0(rd_kafka_conf_t *conf) { + struct ici *ici; + + ici = calloc(1, sizeof(*ici)); + ici->audit_producer_conf = rd_kafka_conf_new(); + + rd_kafka_conf_interceptor_add_on_new(conf, __FILE__, on_new, ici); + rd_kafka_conf_interceptor_add_on_conf_set(conf, __FILE__, on_conf_set, ici); + rd_kafka_conf_interceptor_add_on_conf_destroy(conf, __FILE__, + on_conf_destroy, ici); +} + +rd_kafka_resp_err_t conf_init(rd_kafka_conf_t *conf, + void **plug_opaquep, + char *errstr, + size_t errstr_size){ + conf_init0(conf); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + +Here, we are initializing the `ici` object along with instantiating the members of the `ici` struct. 
Along with that, we are defining the methods for when a new interceptor is created, when the configuration is set, and when the configuration is destroyed, i.e., when the interceptor is destroyed. + +The conf_init method has access to the complete client configuration, which is passed to the on_new, on_conf_set and on_conf_destroy methods. + + +#### on_new + +The on_new method is called when rd_kafka_new() or equivalent methods in other languages are called. We will use this method to set up the interceptor. + + +```c +static rd_kafka_resp_err_t on_new(rd_kafka_t *rk, + const rd_kafka_conf_t *conf, + void *ic_opaque, + char *errstr, + size_t errstr_size) { + struct ici *ici = ic_opaque; + + rd_kafka_interceptor_add_on_send(rk, __FILE__, on_send, ici); + rd_kafka_interceptor_add_on_acknowledgement(rk, __FILE__, + on_acknowledgement, ici); + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + +Here, we define the methods for the 2 important events of a producer interceptor: on_send and on_acknowledgement. Both these methods will have access to the interceptor object. + + +#### on_conf_set + +The on_conf_set method is called for each configuration set by the kafka client. The method should return one of `RD_KAFKA_CONF_OK`, `RD_KAFKA_CONF_INVALID` or `RD_KAFKA_CONF_UNKNOWN`. + + + +* If the configuration is a configuration that is expected by the interceptor and not necessary for the client, the method should return `RD_KAFKA_CONF_OK`. +* If the method returns `RD_KAFKA_CONF_UNKNOWN`, librdkafka will check if the configuration is valid with other interceptors and, finally, with the list of other configurations that librdkafka recognizes. +* Returning `RD_KAFKA_CONF_INVALID` will cause the client to fail. This should only be returned if the configuration name is expected by the interceptor but the value is not as expected. + +```c +static rd_kafka_conf_res_t on_conf_set(rd_kafka_conf_t *conf, + const char *name, + const char *val, + char *errstr, + size_t errstr_size, + void *ic_opaque) { + + struct ici *ici = ic_opaque; + int level = 3; + const char *prefix = "audit."; + + + if (strcmp(name, "audit.topic")==0) { + ici->audit_topic = strdup(val); + return RD_KAFKA_CONF_OK; + } + + + if (strncmp(prefix, name, strlen(prefix)) == 0) { + size_t prop_len = strlen(name)-strlen(prefix); + char *prop = (char *)malloc((prop_len + 1) * sizeof(char)); + strncpy(prop, name+strlen(prefix), prop_len); + rd_kafka_conf_set(ici->audit_producer_conf, prop, val, errstr, errstr_size); + return RD_KAFKA_CONF_OK; + } + else { + /* UNKNOWN makes the conf_set() call continue with + * other interceptors and finally the librdkafka properties. */ + return RD_KAFKA_CONF_UNKNOWN; + } + + return RD_KAFKA_CONF_UNKNOWN; +} +``` + + + +Here, we are checking if the configuration name is “audit.topic” or if it starts with “audit.” before returning `RD_KAFKA_CONF_UNKNOWN` if it does not match the criteria. If the criteria is met, the interceptor instance ici is updated with the values. + + +#### on_conf_destroy + +The `on_conf_destroy` method is called when rd_kafka_conf_destroy() or equivalent methods in other languages are called. We will use this method to clean up and free memory for any objects that were created as part of the interceptor. 
\ + + + +```c +static void ici_destroy(struct ici *ici) { + if (ici->audit_producer_conf) + free(ici->audit_producer_conf); + if (ici->audit_topic) + free(ici->audit_topic); + free(ici); +} + +static rd_kafka_resp_err_t on_conf_destroy(void *ic_opaque) { + struct ici *ici = ic_opaque; + ici_destroy(ici); + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + + +#### on_send + +The `on_send` method is called when the client application code calls the produce() method. The on_send method has access to the top level object `rd_kafka_t` and the message produced `rd_kafka_message_t`. The method also has access to the interceptor instance ici for any global variables. + + +```c +static rd_kafka_resp_err_t on_send(rd_kafka_t *rk, rd_kafka_message_t *rkmessage, void *ic_opaque) { + struct ici *ici = ic_opaque; + if (!rkmessage || !rkmessage->payload) { + fprintf(stderr, "No message payload to process.\n"); + return RD_KAFKA_RESP_ERR_NO_ERROR; + } + + char *correlation_id = generate_correlation_id(); + + rd_kafka_headers_t *headers; + rd_kafka_message_headers(rkmessage, &headers); + if (!headers) { + headers = rd_kafka_headers_new(8); + } + + rd_kafka_header_add(headers, "correlation_id", -1, correlation_id, -1); + + rd_kafka_message_set_headers(rkmessage, headers); + free(correlation_id); + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + +We are fetching the message headers, and if there are no message headers, we are initializing it. Finally, we set the header `correlation_id` with the random UUID generated. + + +#### on_acknowledgement + +The `on_acknowledgement` method is called when the broker returns the message acknowledgement with the message metadata, such as offset and partition of the message in the broker, along with the complete message produced. + + +```c +rd_kafka_resp_err_t on_acknowledgement(rd_kafka_t *rk, + rd_kafka_message_t *rkmessage, + void *ic_opaque) { + struct ici *ici = ic_opaque; + + const void *correlation_id; + size_t correlation_id_size; + + const rd_kafka_conf_t *producer_conf = rd_kafka_conf(rk); + char client_id[512]; + size_t client_id_size = sizeof(client_id); + rd_kafka_conf_get(producer_conf, "client.id", client_id, &client_id_size); + const char *topic = rd_kafka_topic_name(rkmessage->rkt); + rd_kafka_headers_t *headers; + rd_kafka_message_headers(rkmessage, &headers); + if (!headers) { + headers = rd_kafka_headers_new(8); + } + + if (rd_kafka_header_get(headers, 0, "correlation_id", &correlation_id, &correlation_id_size) == RD_KAFKA_RESP_ERR_NO_ERROR) { + time_t rawtime; + struct tm * timeinfo; + + time ( &rawtime ); + timeinfo = localtime ( &rawtime ); + // Send the message to an audit topic using a separate producer instance + rd_kafka_t *audit_rk = init_audit_producer(ici); + if (audit_rk) { + char *json_payload = generate_json_payload(asctime(timeinfo), correlation_id, topic, client_id, rkmessage->partition, rkmessage->offset); + rd_kafka_producev( + audit_rk, + RD_KAFKA_V_TOPIC(ici->audit_topic), + RD_KAFKA_V_MSGFLAGS(RD_KAFKA_MSG_F_COPY), + RD_KAFKA_V_VALUE(json_payload, strlen(json_payload)), + RD_KAFKA_V_KEY(correlation_id, correlation_id_size), + RD_KAFKA_V_END + ); + rd_kafka_flush(audit_rk, 10000); // Wait for messages to be delivered + rd_kafka_destroy(audit_rk); // Clean up the audit producer + } + } + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + +Using the top-level `rd_kafka_t` object, we can fetch the producer configuration, such as the client ID, and using the `rd_kafka_message_t` object, we can fetch the headers for the correlation ID. 
We will instantiate a producer using the audit producer configuration stored in the interceptor object `ici`. Finally, we will produce the audit JSON message to the audit topic before cleaning up the resources created. + + +#### Helper functions + +There are 2 helper functions for generating a random UUID, which will be used as the correlation id and for generating a JSON payload for the audit message. For the sake of brevity, we will omit the code in this blog but the complete code can be found on [Github](https://github.com/Platformatory/libdrdkafka-interceptors/blob/7d99698479119e5df931304a5a88ee3838a4c510/audit_producer_interceptor.c#L15). + + +```c +char *generate_correlation_id() { + // Omitted for brevity +} + +char *generate_json_payload(const char *timestamp, const char *correlation_id, + const char *topic, const char *client, int partition, long offset) { + // Omitted for brevity +} +``` + + + +#### Error Handling + +Every method in the librdkafka interceptor API expects an error code to be returned. `RD_KAFKA_RESP_ERR_NO_ERROR` is to be returned if there are no errors in the method. The full list of error codes can be found in the [librdkafka source code](https://github.com/confluentinc/librdkafka/blob/10824053b4832435f9fd47d72c63ae7dd14aa798/src/rdkafka.h#L281). + + +#### Compiling + +We will need [librdkafka](https://github.com/confluentinc/librdkafka), [jannson](https://github.com/akheron/jansson) and [libuuid](https://linux.die.net/man/3/libuuid) installed in the environment where the interceptor will be compiled. + +For ubuntu: + + +```bash +sudo apt install librdkafka-dev libjansson-dev uuid-dev +``` + + +GCC will be used to build a shared object file after compiling the C interceptor code. + + +```bash +gcc -o audit_producer_interceptor.so -shared -fPIC audit_producer_interceptor.c -lrdkafka -ljansson -luuid +``` + + + +#### Testing + +Once the shared object is built, we can plug it into the client application code. To test the interceptor, we will be using Python with the [confluent-kafka-python](https://github.com/confluentinc/confluent-kafka-python) library, which is a wrapper for librdkafka in Python. + + +```python +from confluent_kafka import Producer +import random +import os + +conf = { + 'bootstrap.servers': 'localhost:9092', + 'plugin.library.paths': os.getcwd()+'/audit_producer_interceptor.so', # Ensure correct path + 'client.id': 'demo-producer', + 'audit.bootstrap.servers': os.environ.get('AUDIT_BOOTSTRAP_SERVER'), + 'audit.sasl.mechanism': 'PLAIN', + 'audit.security.protocol': 'SASL_SSL', + 'audit.sasl.username': os.environ.get('AUDIT_SASL_USERNAME'), + 'audit.sasl.password': os.environ.get('AUDIT_SASL_PASSWORD'), + 'audit.topic': 'audit_topic', +} + +def main(): + + producer = Producer(conf) + + def delivery_report(err, msg): + if err is not None: + print(f"Message delivery failed: {err}") + else: + print(f"Message delivered to {msg.topic()} [{msg.partition()}]") + + # Produce a message + producer.produce('test_topic', "Hello world! Here's a random number - "+str(random.randint(1, 2100)), callback=delivery_report) + +if __name__ == "__main__": + main() +``` + + +Notice the `plugin.library.paths` which is set to the path of the shared object previously compiled. 
The shared object can also be placed in `/usr/local/lib` or `/usr/lib`, depending on the librdkafka version, and `plugin.library.paths` can then be set to just the name of the shared object - `audit_producer_interceptor`.
+
+The audit producer and topic configuration is defined with the `audit.` prefix. It can point to the same Kafka cluster or to an entirely different cluster.
+
+Once we run the Python application, we should be able to consume from the audit topic. An example of a message from the audit topic -
+
+```json
+{
+    "timestamp": "Fri Jun 7 13:23:43 2024",
+    "correlation_id": "c0613861-f1a9-47f0-9b92-09a2346fe250",
+    "action": "produce",
+    "topic": "test_topic",
+    "client": "ksb-demo-producer",
+    "partition": 0,
+    "offset": 13
+}
+```
+
+Complete code for the producer interceptor can be found on [GitHub](https://github.com/Platformatory/libdrdkafka-interceptors/blob/main/audit_producer_interceptor.c) along with the code for the [test client](https://github.com/Platformatory/libdrdkafka-interceptors/blob/main/test/producer.py).
+
+### Consumer Interceptor with librdkafka
+
+A consumer interceptor can be built similarly to the producer interceptor, with the same set of APIs except for the consumer-specific hooks `on_consume` and `on_commit` instead of `on_send` and `on_acknowledgement`.
+
+A consumer interceptor has an Interceptor instance, which was defined as ici in the producer interceptor. The interceptor’s lifecycle starts with the conf_init method, where we define the on_new, on_conf_set and on_conf_destroy methods, similar to the producer interceptor. In the on_new method, we define the methods for on_consume and on_commit.
+
+#### on_consume
+
+The `on_consume` method is called just before the consumer client receives the messages fetched through the `rd_kafka_consumer_poll()` method or the equivalent method in other client language libraries. The on_consume method is called for each individual message, even if the consumer fetches multiple messages per poll. The on_consume method has access to the top-level object `rd_kafka_t` and the consumed message, `rd_kafka_message_t`. The method also has access to the interceptor instance ici for any global variables. 
+ + +```c +static rd_kafka_resp_err_t on_consume(rd_kafka_t *rk, rd_kafka_message_t *rkmessage, void *ic_opaque) { + struct ici *ici = ic_opaque; + if (!rkmessage || !rkmessage->payload) { + printf("No message payload to process.\n"); + return RD_KAFKA_RESP_ERR_NO_ERROR; + } + + rd_kafka_headers_t *headers = NULL; + if (rd_kafka_message_headers(rkmessage, &headers) == RD_KAFKA_RESP_ERR_NO_ERROR) { + const void *correlation_id; + size_t correlation_id_size; + + const rd_kafka_conf_t *consumer_conf = rd_kafka_conf(rk); + char group_id[512]; + size_t group_id_size = sizeof(group_id); + rd_kafka_conf_get(consumer_conf, "group.id", group_id, &group_id_size); + const char *topic = rd_kafka_topic_name(rkmessage->rkt); + if (rd_kafka_header_get(headers, 0, "correlation_id", &correlation_id, &correlation_id_size) == RD_KAFKA_RESP_ERR_NO_ERROR) { + time_t rawtime; + struct tm * timeinfo; + + time ( &rawtime ); + timeinfo = localtime ( &rawtime ); + + + + rd_kafka_t *audit_rk = init_audit_producer(ici); + if (audit_rk) { + char *json_payload = generate_json_payload(asctime(timeinfo), correlation_id, topic, group_id, rkmessage->partition, rkmessage->offset); + rd_kafka_producev( + audit_rk, + RD_KAFKA_V_TOPIC(ici->audit_topic), + RD_KAFKA_V_MSGFLAGS(RD_KAFKA_MSG_F_COPY), + RD_KAFKA_V_VALUE(json_payload, strlen(json_payload)), + RD_KAFKA_V_KEY(correlation_id, correlation_id_size), + RD_KAFKA_V_END + ); + rd_kafka_flush(audit_rk, 10000); // Wait for messages to be delivered + rd_kafka_destroy(audit_rk); // Clean up the audit producer + } + } else { + fprintf(stderr, "AuditInterceptor: Correlation ID header missing.\n"); + } + } else { + fprintf(stderr, "AuditInterceptor: No headers found.\n"); + } + + return RD_KAFKA_RESP_ERR_NO_ERROR; +} +``` + + +Similar to the producer interceptor’s on_acknowledgement method, we instantiate a producer for the audit message and send a JSON message to the audit topic. + + +#### on_commit + +The `on_commit` method is called when the offsets are committed by the consumer. The on_commit method has access to the top-level object rd_kafka_t and the offsets that were committed. The method also receives the interceptor instance and any error that was returned during the offset commit. + +Since the on_commit method does not have the message and the headers of the message, we will not be able to generate an audit message. The audit message requires the correlation ID that is part of the message header. The example will not include an implementation of the on_commit method. + + +#### Compiling + +Similar to the producer interceptor, we will be using gcc to compile the interceptor and generate a shared object. 
+ + +```bash +gcc -o audit_consumer_interceptor.so -shared -fPIC audit_consumer_interceptor.c -lrdkafka -ljansson +``` + + + +#### Testing + +We can use the following python code to test the consumer interceptor - + + +```python +from confluent_kafka import Consumer +import os + +conf = { + 'bootstrap.servers': 'localhost:9092', + 'group.id': 'ksb-demo-group', + 'auto.offset.reset': 'earliest', + 'plugin.library.paths': os.getcwd()+'/audit_consumer_interceptor', # Ensure correct path + 'audit.bootstrap.servers': os.environ.get('AUDIT_BOOTSTRAP_SERVER'), + 'audit.sasl.mechanism': 'PLAIN', + 'audit.security.protocol': 'SASL_SSL', + 'audit.sasl.username': os.environ.get('AUDIT_SASL_USERNAME'), + 'audit.sasl.password': os.environ.get('AUDIT_SASL_PASSWORD'), + 'audit.topic': 'audit_topic', +} + +consumer = Consumer(conf) +consumer.subscribe(['test_topic']) + +try: + while True: + msg = consumer.poll(timeout=1.0) + if msg is None: + continue + if msg.error(): + print("Consumer error: {}".format(msg.error())) + continue + print(f'Received message: {msg.value().decode("utf-8")}') +finally: + consumer.close() +``` + + +The interceptor is declared using the `plugin.library.paths` configuration. Once we run the application, the interceptor will produce a message to the audit topic. + + +## Conclusion + +Although not every client language library built using librdkafka supports building interceptors natively, we can work around it by building interceptors in C and plugging it into the respective client language code using the configuration `plugin.library.paths`. Interceptors drive various use cases as mentioned earlier but should only be used to perform non-blocking operations, especially in the `on_send` and `on_consume` methods since these can contribute to the latency experienced by the client application. + +The complete source code and the test scripts can be found on [GitHub](https://github.com/Platformatory/libdrdkafka-interceptors). The librdkafka [test cases for interceptors](https://github.com/confluentinc/librdkafka/blob/master/tests/interceptor_test/interceptor_test.c) are also an excellent source for examples of writing interceptors for librdkafka. \ No newline at end of file diff --git a/_posts/2024-07-01-Chargeback-for-Kafka-clusters.md b/_posts/2024-07-01-Chargeback-for-Kafka-clusters.md new file mode 100644 index 0000000000..b461bcfd8d --- /dev/null +++ b/_posts/2024-07-01-Chargeback-for-Kafka-clusters.md @@ -0,0 +1,376 @@ +--- +layout: post +title: "Chargeback for Kafka clusters" +authors: ashwin +categories: [Platform Engineering, Infrastructure, Kafka, Cost] +image: assets/blog-images/kafka_chargeback_blog/TitleImage.jpg +featured: false +hidden: false +teaser: Solving for cost attribution and chargeback in Multi-tenant Kafka clusters +toc: true +--- +# Introduction + +Kafka is the most popular streaming platform, used by more than 80% of the Fortune 100 companies. Several of these enterprises run a number of Kafka clusters, that are (usually) operated by a data / streaming platform team. + +One concern that emerges for such platform teams is to understand usage patterns and attribute costs to their internal customers, for both cost transparency and profit. On a cloud service such as Confluent cloud, it is relatively simple to crunch these stats out of the metrics API. But for other deployments, this can be a somewhat tough problem to solve. + +To begin with there are two broad archetypes for how Kafka platform teams manage. 
+
+- Single-tenant clusters: This is pretty simple. All costs are passed on to the tenant.
+- Multi-tenant clusters: Every tenant is essentially being provided a topic as a service. They may have multiple producers and consumers.
+
+How do we solve for the latter in particular? Let’s explore.
+
+# Kafka Operational Costs
+
+Let’s begin by figuring out the costs of running Kafka. This is not magic and will vary for each organization. The costs fall into two broad categories:
+
+- Fixed Costs
+- Variable Costs
+
+*Fixed Costs* are incurred to keep the Kafka cluster operational, irrespective of how heavily the cluster is used. These include:
+
+- Compute unit costs - Kafka Brokers, Zookeeper, Schema Registry nodes etc.
+- Tooling costs - Monitoring tool, SIEM tool etc.
+- Personnel costs - Kafka administrators, support team etc.
+
+*Variable Costs* vary with the usage of the Kafka cluster. Generally, this cost grows as the scale of data in an organization grows. These include:
+
+- Network costs - Throughput, Max. Connections etc.
+- Storage costs - Local Storage, Tiered storage etc.
+
+Therefore, the fully loaded cost of a Kafka cluster is pretty subjective, and charging internal customers is a little tricky.
+
+# Chargeback Models
+
+When we built this Chargeback solution, we scoped for three models:
+
+- Cost Centre
+- Resource Usage Based
+- Client Usage Based
+
+## Cost Centre
+
+In this model, we charge the tenants a fraction of the total costs incurred to keep the Kafka cluster operational. The cost split among the tenants is determined by their usage of the Kafka cluster relative to its total usage. For example, assume there are 5 teams using a Kafka cluster which incurs a monthly cost of $10,000 and clocks 10 Mbps of average throughput per broker. If every team contributes 2 Mbps of average throughput per broker, then each team is liable to pay $2,000 for Kafka services.
+
+The Cost Centre model enables billing the tenants based on a *fair share* of the total cost. This model fits best in scenarios where the Kafka cluster utilization is not high. Generally, Kafka clusters are over-provisioned to account for growth over time, but the cluster capacity utilization does not grow as fast. So, splitting costs based on relative usage is fair to the customers.
+
+## Resource Usage Based
+
+In this model, we charge the tenants based on their absolute usage of Kafka resources. The Kafka resources which will be billed include:
+
+- Partitions
+- Disk Storage
+- Broker Throughput
+
+The above-mentioned resources are the ones which can be tracked easily and attributed to the operational costs.
+
+A per-unit cost for each of these resources will need to be calculated through thorough load testing of the Kafka clusters.
+
+Tenants pay only for the resources utilized, not a share of the total cost of running the Kafka cluster. This model fits well for scenarios with high Kafka cluster utilization. When the cluster capacity utilization is high, it is best to adopt a *pay-for-what-you-use* strategy, as the costs are generally justified by the high usage. 
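+
+As a rough illustration of how these two models differ, here is a minimal Python sketch (not part of the original tooling) that computes a tenant’s bill under the Cost Centre split and under Resource Usage Based unit rates; the usage numbers and unit rates below are illustrative only.
+
+```python
+def cost_centre_bill(total_cost, tenant_throughput, cluster_throughput):
+    # The tenant pays a fair share of the total cost, proportional to its usage
+    return total_cost * (tenant_throughput / cluster_throughput)
+
+def resource_usage_bill(partitions, storage_bytes, throughput_bps,
+                        partition_rate, storage_rate, throughput_rate):
+    # The tenant pays for absolute usage, multiplied by predefined unit rates
+    return (partitions * partition_rate
+            + storage_bytes * storage_rate
+            + throughput_bps * throughput_rate)
+
+# The example from the text: a $10,000/month cluster doing 10 Mbps per broker,
+# with a tenant contributing 2 Mbps of that throughput
+print(cost_centre_bill(10000, tenant_throughput=2, cluster_throughput=10))  # 2000.0
+
+# Hypothetical usage and unit rates for the Resource Usage Based model
+print(resource_usage_bill(partitions=50, storage_bytes=2_000_000, throughput_bps=500_000,
+                          partition_rate=0.05, storage_rate=0.001, throughput_rate=0.1))
+```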
+
+## Client Usage Based
+
+In this model, we charge the tenants based on their clients’ throughput numbers. This is similar to the *Resource Usage Based* model, but we charge the tenants for the amount of data they send to and/or read from the Kafka cluster.
+
+There are three sub-types in this model, based on who pays:
+
+- Producer Pays - *Only Producer clients pay for the data sent to the cluster*
+- Consumer Pays - *Only Consumer clients pay for the data read from the cluster*
+- Producer and Consumer Pays - *Both Producer and Consumer clients pay for the data sent and read by them respectively*
+
+Client throughput is measured as *bytes-per-second* in the Kafka world. Therefore, a unit *bytes-per-second* cost for both write and read operations will need to be predefined by the platform team. Unit costs can be calculated based on the observed monthly average client throughput.
+
+This model fits well in scenarios with very high Kafka cluster utilization over long periods of time. Its advantage is that it offers a simple pricing strategy: tenants pay for what they send and read, rather than for the Kafka resources they use.
+
+# Solution
+
+Before we get to the solution, it is important to understand the considerations involved:
+
+- Avoid introducing additional costs as much as possible
+- Avoid additional development efforts as much as possible
+- Make use of existing capabilities of Kafka
+- Use tools which are open source and widely used in the Kafka ecosystem
+
+The solution we built uses Kafka JMX metrics, Prometheus and Grafana. Let’s take a look at how it works.
+
+Kafka’s JMX metrics provide a wealth of detailed information on the operations of the Kafka cluster. Some of the useful broker-side metrics include:
+
+- [kafka_server_brokertopicmetrics_bytesinpersec](https://docs.confluent.io/platform/current/kafka/monitoring.html#bytesinpersec)
+- [kafka_log_log_size](https://docs.confluent.io/platform/current/kafka/monitoring.html#size)
+- [kafka_cluster_partition_replicascount](https://docs.confluent.io/platform/current/kafka/monitoring.html#replicascount)
+- [kafka_server_produce_byte_rate](https://docs.confluent.io/platform/current/kafka/monitoring.html#request-quota)
+- [kafka_server_fetch_byte_rate](https://docs.confluent.io/platform/current/kafka/monitoring.html#request-quota)
+
+**Note:** We will only be using the broker-side metrics to do the Chargeback calculation, as these will always be available to the platform team managing the Kafka clusters.
+
+The throughput rates, storage usage and partition counts can be inferred from the JMX metrics, but we still need to attribute the clients or topics to a customer. This is done by tagging the clients and/or topics with a Chargecode, which is unique to a tenant.
+
+*Chargecode* is a user-defined unique identifier for a tenant associated with a specific Kafka cluster. A Chargecode definition includes a match configuration which defines the topics and/or clients associated with that tenant in that specific Kafka cluster. Please note that client quotas will need to be predefined to capture the client-specific throughput rates on the broker side.
+
+Prometheus is used to scrape the relevant JMX metrics and re-tag them with the Chargecode as per the defined Chargecode configuration. Prometheus enables metric re-tagging through its `metric_relabel_configs` definition. 
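+
+As a purely illustrative sketch, assuming Prometheus is reachable at `http://localhost:9090` and that the relabeling described above has attached a `chargecode` label to the broker metrics, the per-tenant produce throughput can then be pulled from the Prometheus HTTP API with a small script like this:
+
+```python
+import requests
+
+PROMETHEUS_URL = "http://localhost:9090"  # assumption: Prometheus scraping the brokers
+
+# Average produce throughput per Chargecode over the last hour, in bytes/sec
+query = 'sum by (chargecode) (rate(kafka_server_brokertopicmetrics_bytesinpersec[1h]))'
+
+resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query})
+resp.raise_for_status()
+
+for series in resp.json()["data"]["result"]:
+    chargecode = series["metric"].get("chargecode", "unattributed")
+    bytes_per_sec = float(series["value"][1])
+    print(f"{chargecode}: {bytes_per_sec:.0f} B/s")
+```
+
+Multiplying these per-Chargecode numbers by the unit rates (or by their share of the cluster total) is essentially all the billing logic there is; the Grafana dashboards described next perform the same aggregation visually.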
+ +Grafana dashboards are created to visualize the usage and cost per Chargecodes by querying the Prometheus database. The dashboards will demonstrate the Bill per chargecode, Top N Chargecodes based on Storage, Partitions and Throughput usage etc. The dashboards are meant to enable the platform engineers in an organization to make an informed decision on managing resources and distributing costs + +# Configurations + +## Chargeback Model Configuration + +Let’s look at how to configure a Chargeback model for billing tenants in a Kafka cluster. Below is a sample configuration. + +```yaml +clusters: + - cluster1: + cluster_name: Cluster1 # Common Key with Chargecode configuration + cluster_id: 12345 # Unique cluster ID + throughput_units: bytes + currency_code: USD + total_costs: 5000 + total_capacity: + throughput_tps: 1000 + storage: 100000000 + partitions_max: 1000 + chargeback_model: cost_center # The model type + chargeback_config: + partition_unit_rate: 0.05 + throughput_unit_rate: 0.1 + storage_unit_rate: 0.001 + + - cluster3: + cluster_name: Cluster3 + cluster_id: 56789 + throughput_units: bytes + currency_code: USD + total_costs: 5000 + total_capacity: + throughput_tps: 1000 + storage: 100000000 + partitions_max: 1000 + chargeback_model: usage_based + chargeback_config: + partition_unit_rate: 0.05 + throughput_unit_rate: 0.1 + storage_unit_rate: 0.001 + + - cluster2: + cluster_name: Cluster2 + cluster_id: 98765 + throughput_units: bytes + currency_code: USD + total_costs: 6000 + total_capacity: + throughput_tps: 1000 + storage: 100000000 + partitions_max: 1000 + chargeback_model: client_usage_based + chargeback_config: + usage_model: producer_pays # Supported models: producer_pays, consumer_pays, producer_consumer_pays + throughput_in_unit_rate: 0.15 + throughput_out_unit_rate: 0.05 +``` + +One Chargeback model per Kafka cluster. The fields per cluster include, + +- *cluster_name* - Unique name for the cluster. This will be the common key for both model and chargecode configuration. +- *cluster_id* - Unique identifier for internal identification. +- *throughput_units* - Unit for the throughput measurement. Either “bytes” or “records”. Only “bytes” for client usage based model. +- *currency_code* - The currency unit for the bill generated +- *total_costs* - Monthly total operational cost of the Kafka cluster +- *total_capacity* - Total capacity available for that Kafka cluster + - *throughput_tps* - Allowed maximum throughput transactions (bytes or records) per second + - *storage* - Allowed maximum storage in bytes + - *partitions_max* - Allowed maximum topic partitions count +- *chargeback_model* - The model type for the Chargeback calculation +- *chargeback_config* - The per-unit rates for various resources and client throughput + - *partition_unit_rate* - Unit price for a topic partition + - *storage_unit_rate* - Unit price for each storage byte + - *throughput_unit_rate* - Unit price for each byte or record written to a Broker + - *usage_model* - The sub-model type for Client usage based model. Valid values are “producer_pays”, “consumer_pays” and “producer_consumer_pays” + - *throughput_in_unit_rate* - Unit price for each byte or record sent by a producer client. Only applicable for Client usage based model. + - *throughput_out_unit_rate* - Unit price for each byte or record read by a consumer client. Only applicable for Client usage based model. + +## Chargecode Configuration + +Let’s look at how to define Chargecodes for billing tenants in a Kafka cluster. 
Below is a sample configuration. + +```yaml +chargecodes: + - chargecode_name: Chargecode1 + target_cluster: Cluster1 # Joining Key + match_config: + - entity: client_id # Either client_id or topic + match_type: prefix # Either prefix, regex or literal + match_expression: "clientA_" + - entity: topic + match_type: literal + match_expression: "important_topics" + + - chargecode_name: Chargecode2 + target_cluster: Cluster2 + match_config: + - entity: client_id + match_type: regex + match_expression: "^clientB_.+" + - entity: topic + match_type: literal + match_expression: "special_topics" + + - chargecode_name: Chargecode3 + target_cluster: Cluster3 + match_config: + - entity: client_id + match_type: literal + match_expression: "clientC" + - entity: topic + match_type: prefix + match_expression: "experimental_" +``` + +Ideally, one Chargecode configuration per team per cluster. There will be one or more Chargecodes associated with a Kafka cluster. The fields per Chargecode definition include, + +- *chargecode_name* - Unique name for the Chargecode. Generally, the team name will be used +- *target_cluster* - The Kafka cluster to which the Chargecode will be associated +- *match_config* - The topic or client mapping associated with the tenant + - *entity -* Entity type. Either “topic” or “client_id” + - *match_type* - Matching logic. Either “literal”, “prefix” or “regex” + - *match_expression* - The resource name to match + +## Prometheus Configuration + +The defined Chargecode configuration will be converted into a “*metric_relabel_configs*” in the Prometheus configuration. Below is a sample Prometheus configuration for one Kafka cluster with Chargecode mapping defined under the “metric_relabel_configs” section. + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - /etc/prometheus/rules.yml + +scrape_configs: + - job_name: 'kafka-broker' + scrape_interval: 5s + metrics_path: / + static_configs: + - targets: ['kafka1:9101','kafka2:9102','kafka3:9103'] + labels: + env: "dev" + cluster: "Cluster1" + relabel_configs: + - source_labels: [__address__] + target_label: hostname + regex: '([^:]+)(:[0-9]+)?' + replacement: '${1}' + metric_relabel_configs: + - source_labels: ['cluster', 'topic'] + separator: '_' + target_label: chargecode + regex: '^Cluster1_important_topics$' + replacement: 'Chargecode1' + - source_labels: ['cluster', 'client_id'] + separator: '_' + target_label: chargecode + regex: '^Clister1_clientA_(.*)$' + replacement: 'Chargecode1' +``` + +Additionally, we will define a custom metric called “*broker_throughput_rate*” which calculates the rate of the bytes produced to Kafka using the Prometheus Rules. This will be used as the throughput metric in the “Resource Usage Based” Chargeback model. + +```yaml +groups: + ### BROKER THROUGHPUT RATE + - name: record_broker_throughput_rate + rules: + ## Throughput related units calculation - Bytes + - record: broker_throughput_rate + expr: rate(kafka_server_brokertopicmetrics_bytesinpersec[5m]) + labels: + env: "dev" + cluster: "Cluster1" +``` + +# Dashboards + +Simple and reusable Grafana dashboards are provisioned to view the usage and costs of different tenants in an organization. Let’s look at the dashboard for each of the Chargeback model. + +## Cost Centre + +The following metrics are highlighted in the Cost Center model dashboard. 
+
+- Total cost for the Kafka cluster
+- Current cluster capacity utilization
+- Chargecode-wise split of the cost
+    - In Amount
+    - In Percentage
+- Top 5 Chargecodes based on usage of
+    - Throughput
+    - Partition
+    - Storage
+
+![Cost Centre1.png](../assets/blog-images/kafka_chargeback_blog/Cost_Centre1.png)
+
+![CostCentre2.png](../assets/blog-images/kafka_chargeback_blog/CostCentre2.png)
+
+## Resource Usage Based
+
+The following metrics are highlighted in the Resource Usage Based model dashboard.
+
+- Total cost for the Kafka cluster
+- Current cluster capacity utilization
+- Total Bill based on current usage
+- Profit/Loss value (Bill - Cost)
+- Chargecode-wise split of the usage
+    - In Amount
+    - In Percentage
+- Top 5 Chargecodes based on usage of
+    - Throughput
+    - Partition
+    - Storage
+- Usage bill per Chargecode for each resource type
+    - Throughput
+    - Partition
+    - Storage
+- Usage metrics per Chargecode over time for each resource type
+    - Throughput
+    - Partition
+    - Storage
+
+![ResourceUsage1.png](../assets/blog-images/kafka_chargeback_blog/ResourceUsage1.png)
+
+![ResourceUsage2.png](../assets/blog-images/kafka_chargeback_blog/ResourceUsage2.png)
+
+![ResourceUsage3.png](../assets/blog-images/kafka_chargeback_blog/ResourceUsage3.png)
+
+![ResourceUsage4.png](../assets/blog-images/kafka_chargeback_blog/ResourceUsage4.png)
+
+![ResourceUsage5.png](../assets/blog-images/kafka_chargeback_blog/ResourceUsage5.png)
+
+## Client Usage Based
+
+The following metrics are highlighted in the Client Usage Based model dashboard.
+
+- Total cost for the Kafka cluster
+- Current cluster capacity utilization
+- Total Bill based on current usage
+- Profit/Loss value (Bill - Cost)
+- Chargecode-wise split based on client usage
+    - In Amount
+    - In Percentage
+- Top 5 Clients per Chargecode based on usage
+- Usage bill per Chargecode for each client type
+    - Producer
+    - Consumer
+- Usage (throughput) metrics per Chargecode over time for each client type
+    - Producer
+    - Consumer
+
+![ClientUsage1.png](../assets/blog-images/kafka_chargeback_blog/ClientUsage1.png)
+
+![ClientUsage2.png](../assets/blog-images/kafka_chargeback_blog/ClientUsage2.png)
+
+![ClientUsage3.png](../assets/blog-images/kafka_chargeback_blog/ClientUsage3.png)
+
+# Conclusion
+
+In conclusion, implementing a Kafka chargeback solution is an effective way to attribute operational costs to various tenants in a multi-tenant Kafka environment. By leveraging the power of Kafka JMX metrics, Prometheus, and Grafana, platform teams can gain detailed insights into usage patterns and accurately distribute costs based on a variety of chargeback models. Not only does this encourage responsible usage of resources, but it also provides transparency and fairness in cost attribution. Looking ahead, we aim to refine and expand this solution, incorporating client-side JMX metrics and introducing more visualization features.
diff --git a/_posts/2024-07-09-unified-access-layer-kafka-resources-part-1.md b/_posts/2024-07-09-unified-access-layer-kafka-resources-part-1.md
new file mode 100644
index 0000000000..6c7d4b5371
--- /dev/null
+++ b/_posts/2024-07-09-unified-access-layer-kafka-resources-part-1.md
@@ -0,0 +1,433 @@
+---
+layout: post
+title: "Unified Access Layer for Kafka Resources: Part 1"
+authors: p6,raghav
+categories: [Platform Engineering, Infrastructure, Kafka, Unified Access Layer]
+image: assets/blog-images/unified_access_layer/unified_access_layer_1.png
+featured: false
+hidden: false
+teaser: Unified access layer for Kafka resources, Part 1
+toc: true
+---
+
+# Introduction
+
+Managing Kafka clusters is all well and good as a platform team, until you have a handful of teams and a handful of clusters.
+
+![managing-clusters-problem.png](../assets/blog-images/unified_access_layer/managing-clusters-problem.png)
+
+Let’s visit some common scenarios where every platform team has ended up with a mish-mash of ungoverned clusters or topics that eventually need some garden cleaning (for good reason).
+
+- You need to move or migrate topics within the same cluster (surprise: Kafka can’t shrink partitions, so you will have to create a new topic and replicate to mirror the old). You may also need to move or migrate a topic (say, one with hot, noisy-neighbor partitions) to a different cluster altogether.
+- You may want to move a whole cluster to better infrastructure (or, more commonly, because you have a disaster!).
+- You need to change the authentication mechanism and (re)distribute credentials.
+- Your devs are complaining and need help optimizing their configurations (and they have no clue what they’re doing).
+
+If you’ve been there, done that and burnt your hands, you’re not alone. What makes this all pretty hard in the Kafka world is the nature of Kafka’s API, where clients do much of the song and dance. We have a proposal for future-proofing against all of these concerns, which includes:
+
+- The ability to route a client to a specific cluster (bootstrap.servers) dynamically, without having these hardcoded in the client
+    - ProTip: Very useful for entire cluster migrations
+- The ability to route a client to a specific topic, based on a higher-order messaging or routing primitive, without being joined at the hip to a fixed set of topics
+    - ProTip: Very useful for topic migrations, but also for routing multiple types of messages to the same topic
+- Credential mediation
+    - Giving clients their auth instruments dynamically: potentially references to secret stores, or serving the credentials straight up (with the ability to rotate them on the server side as well as revoke them on demand)
+- Configuration profiles
+    - The ability to provide configurations well optimized for latency, throughput, durability or availability, or otherwise just your distillation of best practices for producer, consumer and admin client configurations
+
+Having some abstractions to achieve these cross-cutting concerns is a good measure and will yield sanity to platform operators, who may need to reserve a great deal of control for themselves. In general, for large-scale use-cases, it is always good to have some insulation and indirection between Kafka clients and the server, in the spirit of the proxy patterns popular in the HTTP/REST world.
+
+This post intends to propose common solutions (as a series) that would help overcome these problems and, in the true spirit of a platform, provide an effective amount of self-service.
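+
+To make the proposal a little more concrete, the cross-cutting concerns above can be thought of as a single routing descriptor that the access layer hands to a client before it ever opens a Kafka connection. The sketch below is a minimal Java illustration; the class and field names are our own assumptions and are not part of any existing SDK or of the implementation discussed later in this series.
+
+```java
+import java.util.Map;
+import java.util.Properties;
+
+/** Illustrative shape of what a unified access layer could return to a client. */
+public final class RouteDescriptor {
+    private final String bootstrapServers;            // resolved dynamically, never hardcoded in the client
+    private final String topic;                       // logical channel resolved to a physical topic
+    private final String credentialsRef;              // e.g. a secret-store reference the client dereferences
+    private final Map<String, String> configProfile;  // curated producer/consumer tuning profile
+
+    public RouteDescriptor(String bootstrapServers, String topic,
+                           String credentialsRef, Map<String, String> configProfile) {
+        this.bootstrapServers = bootstrapServers;
+        this.topic = topic;
+        this.credentialsRef = credentialsRef;
+        this.configProfile = Map.copyOf(configProfile);
+    }
+
+    public String topic() { return topic; }
+
+    public String credentialsRef() { return credentialsRef; }
+
+    /** Merges the server-curated profile with the dynamically resolved bootstrap servers. */
+    public Properties toClientConfig() {
+        Properties props = new Properties();
+        props.putAll(configProfile);
+        props.put("bootstrap.servers", bootstrapServers);
+        return props;
+    }
+}
+```
+
+A client would cache such a descriptor with a short TTL, so that cluster or topic migrations, profile updates and credential rotations propagate without a redeploy.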
+
+# Approach 1: Discover topics through language-specific SDKs and a rule-based routing layer
+
+This is a pattern where a language-specific SDK handles these concerns by talking to a metadata or catalog service.
+
+Typically, before instantiating a Kafka client, the client here would locate the target Kafka service using
+
+- An HTTP/REST call to the catalog service (using some form of security, such as basic auth or OAuth)
+    - Express an intent (such as to produce or consume) on a certain topic or domain of topics (such as a prefix pattern), OR, by virtue of its client ID, be routed to the appropriate cluster
+- Receive the Kafka bootstrap location
+    - Optionally, mediated credentials for both Kafka and, typically, schema registry
+- The above, cached with a TTL
+- Return a Kafka client instance
+
+This approach has a wider domain of applicability, beyond containerized environments, but it does require
+
+- A secured metadata or catalog service (run by you)
+    - Which ideally has automation for inventory discovery across multiple clusters
+    - Combined with routing rules
+- A language-specific, idiomatic service locator implementation (“SDK”)
+    - And therefore code changes to existing applications, including exception handling
+
+
    +flowchart LR + + subgraph PRODUCERS["Producers"] + P1("Producer 1") + P2("Producer 2") + P3("Producer 3") + end + subgraph CG1["Consumer Group 1"] + C1("Consumer 1") + end + subgraph CG2["Consumer Group 2"] + C2("Consumer 1") + C3("Consumer 2") + end + subgraph CG3["Consumer Group 3"] + C4("Consumer 1") + C5("Consumer 2") + end + + subgraph GW["Rule Based Routing Layer"] + direction LR + API["APIs (Kong)"] + US["Upstream (Catalog Service)"] + end + subgraph K1["Kafka Cluster 1"] + B1["Broker 1"] + B2["Broker 2"] + B3["Broker 3"] + end +subgraph K2["Kafka Cluster 2"] + B4["Broker 1"] + B5["Broker 2"] + B6["Broker 3"] + end + + + P1 & P2 & P3 -- 1.GetInfo --> GW -- 2.Response --> P1 & P2 & P3 + CG1 & CG2 & CG3 -- 1.GetInfo --> GW -- 2.Response --> CG1 & CG2 & CG3 + P1 -- 3.Produces --> K1 + P2 -- 3.Produces --> K2 + P3 -- 3.Produces --> CC + API --> US + K1 -- 4.Consumes --> CG1 + K2 -- 4.Consumes --> CG2 + CC -- 4.Consumes --> CG3 + + %% Styling + classDef kafkaStyle fill:#e0f7e9,stroke:#4caf50,stroke-width:2px; + class K1,K2 kafkaStyle; + + +
    + +We get into the description and implementation details here: [Unified Access Layer: Part 2: Rule Based Routing](https://platformatory.io/blog/unified-access-layer-part-2-rule-based-routing) + + +# Approach 2: Use a Service Mesh (for Kubernetes Environment) + +A service mesh (like Istio, Kuma) can be leveraged as a layer to access Kafka resources. It typically includes service discovery mechanisms that allow services to dynamically discover and communicate with each other, and this can be extended to Kafka by enabling services to discover Kafka brokers and other Kafka-related services (like schema registries or Kafka Connect connectors). + +***Understanding Service Mesh and Sidecar Pattern*** + +***Service Mesh***: A service mesh is an infrastructure layer that enables managed, observable, and secure communication between microservices. It typically includes features like traffic management, security, and observability. + +***Sidecar Pattern***: In a service mesh, the sidecar pattern involves deploying a helper container (sidecar) alongside each microservice container. This sidecar handles the networking responsibilities such as routing, proxying, and observing traffic. + +***Key Concepts***: + +***Local Sidecar Endpoint***: Each microservice communicates with its own local sidecar proxy rather than directly with other services or external endpoints. + +***Broker / Mesh Filters***: The sidecar uses rules and filters (configured in the service mesh) to manage traffic, including routing requests to the appropriate upstream services. + +***Central idea***: You produce to or consume from a local sidecar endpoint (which uses broker / mesh filters to route the request upstream). + +Let's dissect this. + +- You produce to or consume from a local sidecar endpoint: + - Produce: In Kafka terminology, producing refers to sending data (messages) to a Kafka topic. + - Consume: Consuming refers to retrieving data (messages) from a Kafka topic. + - Local Sidecar Endpoint: Instead of directly interacting with Kafka brokers, your application sends data to a local sidecar proxy endpoint. This proxy is running in the same pod as your application. + +- Which uses broker / mesh filters: + - Broker: This typically refers to the Kafka brokers that handle message storage and retrieval. + - Mesh Filters: These are configurations within the service mesh that dictate how traffic should be managed. Filters can include rules for load balancing, retries, timeouts, security policies, etc. + +- To route the request upstream: + - Upstream: In networking, upstream refers to the direction towards the service you are communicating with. Here, it means the Kafka brokers. + - The local sidecar proxy receives the request and, based on the configured filters and routing rules, forwards (or routes) the request to the appropriate Kafka broker. + + +Here's the overall architecture diagram for this idea. + +
    +flowchart TD + %% Kubernetes environment + subgraph K8S ["Kubernetes"] + direction TB + + %% Service Mesh (Istio) + subgraph Istio ["Service Mesh (Istio)"] + direction TB + + %% Producer Pods + subgraph P1Pod ["Pod: Producer 1"] + direction TB + P1("Producer 1") + P1Envoy["Envoy Proxy (P1)"] + Filter1["Mesh Filter (P1)"] + P1 -->|Produces To| P1Envoy --> Filter1 + end + + subgraph P2Pod ["Pod: Producer 2"] + direction TB + P2("Producer 2") + P2Envoy["Envoy Proxy (P2)"] + Filter2["Mesh Filter (P2)"] + P2 -->|Produces To| P2Envoy --> Filter2 + end + + %% Kafka Cluster 1 + subgraph K1 ["Kafka Cluster 1"] + direction TB + subgraph Broker1Pod1 ["Pod: Broker 1"] + direction TB + Broker1Envoy1["Envoy Proxy (Broker 1)"] + Broker1_1("Broker 1") + Broker1Envoy1 --> Broker1_1 + end + + subgraph Broker1Pod2 ["Pod: Broker 2"] + direction TB + Broker1Envoy2["Envoy Proxy (Broker 2)"] + Broker1_2("Broker 2") + Broker1Envoy2 --> Broker1_2 + end + + subgraph Broker1Pod3 ["Pod: Broker 3"] + direction TB + Broker1Envoy3["Envoy Proxy (Broker 3)"] + Broker1_3("Broker 3") + Broker1Envoy3 --> Broker1_3 + end + + Topic1["Topic 1"] + Topic2["Topic 2"] + end + + %% Kafka Cluster 2 + subgraph K2 ["Kafka Cluster 2"] + direction TB + subgraph Broker2Pod1 ["Pod: Broker 1"] + direction TB + Broker2Envoy1["Envoy Proxy (Broker 1)"] + Broker2_1("Broker 1") + Broker2Envoy1 --> Broker2_1 + end + + subgraph Broker2Pod2 ["Pod: Broker 2"] + direction TB + Broker2Envoy2["Envoy Proxy (Broker 2)"] + Broker2_2("Broker 2") + Broker2Envoy2 --> Broker2_2 + end + + subgraph Broker2Pod3 ["Pod: Broker 3"] + direction TB + Broker2Envoy3["Envoy Proxy (Broker 3)"] + Broker2_3("Broker 3") + Broker2Envoy3 --> Broker2_3 + end + + Topic3["Topic 3"] + Topic4["Topic 4"] + end + + %% Consumer Pods + subgraph C1Pod ["Pod: Consumer 1"] + direction TB + C1("Consumer 1") + C1Envoy["Envoy Proxy (C1)"] + Filter3["Mesh Filter (C1)"] + C1 -->|Consumes From| C1Envoy --> Filter3 + end + + subgraph C2Pod ["Pod: Consumer 2"] + direction TB + C2("Consumer 2") + C2Envoy["Envoy Proxy (C2)"] + Filter4["Mesh Filter (C2)"] + C2 -->|Consumes From| C2Envoy --> Filter4 + end + end + end + + %% Connections + Filter1 -->|Produces to| Topic1 & Topic2 + Filter2 -->|Produces to| Topic3 & Topic4 + Topic1 -->|Consumed by| Filter3 + Topic3 -->|Consumed by| Filter4 + Broker1Envoy1 -->|Manages| Topic1 & Topic2 + Broker1Envoy2 -->|Manages| Topic1 & Topic2 + Broker1Envoy3 -->|Manages| Topic1 & Topic2 + Broker2Envoy1 -->|Manages| Topic3 & Topic4 + Broker2Envoy2 -->|Manages| Topic3 & Topic4 + Broker2Envoy3 -->|Manages| Topic3 & Topic4 + + %% Styling + classDef kafkaStyle fill:#e0f7e9,stroke:#4caf50,stroke-width:2px; + class K1,K2 kafkaStyle; + +
    +
+Downsides to using a service mesh:
+- It is another piece of middleware that needs someone familiar with its internals (like the Envoy proxy) to operate it.
+- Tenancy: the more tenants there are, the more valuable the mesh becomes, but careful planning is needed for policy, automation, tenancy and isolation.
+- Being another service in the request path, it requires an understanding of its configuration, operation and integration within the organization, along with governance between different teams.
+
+# Approach 3: Virtualize the Kafka Cluster and Topic (through a Kafka Gateway)
+
+In this approach, the gateway (e.g., [Conduktor](https://docs.conduktor.io/gateway/)) is a network proxy for Apache Kafka with capabilities that are extensible and customizable through plugins.
+
+Here is the architecture diagram to visualize the capabilities.
+
+
    +flowchart TD + subgraph PRODUCERS["Producers"] + P1("Producer 1") + P2("Producer 2") + end + subgraph CG1["Consumer Group 1"] + C1("Consumer 1") + end + subgraph CG2["Consumer Group 2"] + C2("Consumer 2") + C3("Consumer 3") + end + subgraph CONSUMERS["Consumers"] + CG1 + CG2 + end + subgraph GW["Kafka Gateway"] + direction LR + AA["Authentication & Authorization"] + PL["Plugins"] + end + subgraph K1["Kafka Cluster"] + B1["Broker 1"] + B2["Broker 2"] + B3["Broker 3"] + end + subgraph D1["Destinations"] + K1 + CC["Confluent Cloud"] + end + subgraph A1["Producers & Consumers"] + direction LR + PRODUCERS + CONSUMERS + end + P1 & P2 -- Produce --> GW -- Consume --> CG1 & CG2 + K1 & CC -- Messages --> GW + AA --> PL + GW -- Forward ---> K1 & CC + + classDef kafkaStyle fill:#e0f7e9,stroke:#4caf50,stroke-width:2px; + class A1,D1 kafkaStyle; +
    +
+The gateway is deployed between the producers/consumers and the Kafka clusters, and because it is Kafka protocol compliant, only very small changes (like a new bootstrap server) are needed for the clients to connect to the gateway. Typically, the gateway provides authentication by supporting the same security protocols as the brokers (PLAINTEXT, SSL, SASL SSL, mTLS). The interesting component of the gateway is the interceptors, where we can write the business logic, especially routing the requests, which is the central topic here. The gateway can also support a processing flow (a pipeline of interceptors), where each component of the pipeline is invoked sequentially during a produce request and the responses from the broker go through the pipeline in the reverse order.
+
+Some pros of the approach:
+- being Kafka protocol compliant implies that it can support multiple vendors
+- the extensibility of the gateway could bring operational insights into security, compliance, monitoring and alerting
+- it provides centralized management, letting one team manage access and configuration
+
+Some downsides of the approach:
+- it adds latency depending on how it is implemented (Conduktor adds latency in the order of milliseconds), which can increase if encryption is enabled using a custom encryption key, or if the processing flow pipeline has a number of plugins
+- proprietary tools could result in vendor lock-in
+- the initial setup and configuration could be complex
+
+
+# Approach 4: Discover Topics using a Data Catalog + Self-Service Portal
+
+A data catalog allows organizations to manage and discover data assets (in this case, Kafka clusters/streams, topics, schemas). Depending on the choice of the data catalog, it could support Kafka resources natively or have limited support. Ideally, you would need a data catalog that supports Kafka out-of-the-box.
+
+Here are a few options:
+
+***Apache Atlas***: the only open source solution that integrates well with Kafka, providing management and governance of data streams. It helps with metadata management (tracks lineage and metadata for Kafka topics), classification (allows classification and tagging of Kafka topics), and search and discovery (easy search and discovery of Kafka streams).
+
+***Confluent Stream Catalog***: supports Kafka resources (Schemas, Topics, Connectors, Clusters, Environments, Streaming data pipelines, Apache Flink® compute pools) on Confluent Cloud. It offers benefits such as improved metadata management and easier integration with external systems.
+
+***Other options***: GCP supports a catalog for Pub/Sub. AWS Glue Catalog unfortunately has limited support for streams (through the integration with Amazon MSK, or Managed Service for Kafka). Azure supports Azure Catalog (with similar features to AWS Glue Catalog). Collibra has some limited support for Kafka (apart from metadata management and governance, it supports data quality checks for streaming data). We can also look at OpenMetadata, Databricks Unity Catalog and Amundsen, among others. For everything else, there’s the Hive metastore.
+
+Here's the architecture diagram for the idea:
+
    +flowchart TD + + subgraph PRODUCERS["Producers"] + P1("Producer 1") + P2("Producer 2") + end + subgraph CG1["Consumer Group 1"] + C1("Consumer 1") + end + subgraph CG2["Consumer Group 2"] + C2("Consumer 2") + C3("Consumer 3") + end + subgraph CONSUMERS["Consumers"] + CG1 + CG2 + end + subgraph SSP["Self-Service Portal"] + PO1["Portal"] + end + subgraph DC["Data Catalog for Kafka"] + direction LR + API["APIs"] + MM["Metadata Management & Lineage"] + TL["Classification & Tagging"] + DG["Data Governance"] + DQ["Data Quality"] + end + subgraph SSPDC["Self-Service Portal & Data Catalog"] + direction TB + SSP + DC + end + subgraph KC["Kafka Cluster"] + B1["Broker 1"] + B2["Broker 2"] + B3["Broker 3"] + end +subgraph KC1["Kafka Cluster"] + B4["Broker 1"] + B5["Broker 2"] + B6["Broker 3"] + end + subgraph D1["Destinations"] + direction TB + KC + KC1 + CC["Cloud Provider (Confluent, AWS MSK, Azure, GCP)"] + + end + subgraph A1["Producers & Consumers"] + direction LR + PRODUCERS + CONSUMERS + + end + P1 & P2 -- Produce (with Metadata) --> DC -- Consume (with Metadata) --> CG1 & CG2 + KC & KC1 & CC -- Messages (with Metadata) --> DC + DC -- Forward (with Metadta) ---> KC & KC1 & CC + SSP -- Configures --> DC + + + classDef kafkaStyle fill:#e0f7e9,stroke:#4caf50,stroke-width:2px; + class D1,SSPDC,A1 kafkaStyle; + + +
    +
+These data catalogs offer APIs that allow organizations to seamlessly integrate the data catalog with other systems, automate metadata management tasks, and build custom applications on top of the catalog’s capabilities. This flexibility is crucial for maintaining an efficient and scalable data management ecosystem.
+
+All of the above broadly capture business and technical metadata. You still need a discovery interface to query by these parameters. Ideally, you also need a self-service portal for credential vending.
+
+Data catalogs also come with potential downsides related to complexity, performance, consistency, vendor lock-in, maintenance, and learning curve. Careful evaluation of these factors is essential to determine whether a catalog is suitable for your specific streaming application and organizational needs.
+
+# Conclusion
+
+Creating a system that helps a centralized Kafka infrastructure team easily create, label and vend information reduces common problems and dependencies. The same benefits are passed on to the producers and consumers, creating a scalable system and organization.
diff --git a/_posts/2024-07-10-a-leaderless-future-kafka-streaming.md b/_posts/2024-07-10-a-leaderless-future-kafka-streaming.md
new file mode 100644
index 0000000000..80b25c437e
--- /dev/null
+++ b/_posts/2024-07-10-a-leaderless-future-kafka-streaming.md
@@ -0,0 +1,132 @@
+---
+layout: post
+title: "A leaderless future: kafka-esque story on what is becoming mainstream"
+author: p6
+categories: [ Platform Engineering, Data, Infrastructure, Kafka, Real-time ]
+image: assets/blog-images/kafka-leaderless-future/kafka-dystopian.webp
+featured: false
+hidden: false
+teaser: kafka-esque story on what is becoming mainstream
+toc: true
+---
+
+
+As of 2024, Apache Kafka is almost a teenager. A project & community that has been nothing short of pivotal and truly deserving of the category creator hall of fame. However, the teenage years are a little crazy.
+
+
+# The fundamentals are still strong, but Apache Kafka ~~is almost legacy~~ is a little behind
+
+But let's state it right at the onset: building back in the (mostly) on-premise era, the founders of Kafka were true visionaries who got a lot right from the start. Building on the JVM (for better or worse), persistence as a first-class concern, a design focused on commodity HDDs optimized for sequential I/O, zero-copy optimizations, all on top of a relatively minimalistic wire protocol and tons of bolt-ons organically evolved over the years. To date, every KIP still conforms to a spirit of performance focus, a design ethos not in question.
+
+Justifiably, Kafka has emerged as the de-facto mechanism for data exchange and an API that has become the gold standard beyond its own, self-branded implementation. It is now supported by an increasing mushroom cloud of technologies, both OSS and commercial, tackling the same class of problems Kafka originally set out to. Although for the most part the whole USP is about gaining an “install base” audience, touting “compatibility and drop-in replacement”, it is a classic market-focused adoption play; at any rate, standardizing on someone else’s API is still the deepest form of flattery you can get in tech.
+
+As of 2024, nearly every hyperscaler provides a Kafka (or compatible) service, and there are now prospectively best-in-breed Kafka-esque platforms emerging beyond the Apache Foundation project.
+
+Everything is a product of its times and Kafka is no different.
While the community still keeps up (which we shall discuss later on), it only does so with much baggage and the design constraints it was born with, some of which are just structural in nature.
+
+
+# Some things are still less right and more wrong
+
+Over the years, there has been a greater convergence on what were Kafka’s deficits, but it also falls into the zone (in 20:20 hindsight) of trends that green-field, cloud native b(l)oomer projects have been beneficiaries of.
+
+Let’s look at a few dimensions of differentiation:
+
+1. A simple(r) distribution: aka, a single binary. No Zookeeper (or a better Zookeeper)
+2. Squeezing the last drop of performance: aka, getting rid of the JVM (and its concurrency model)
+3. A modern approach to storage: aka separating storage from compute (and more loosely, tiered storage) but also SSDs. And the radical new idea of completely doing away with disks.
+4. Operator Experience: aka, something better than the clunky kafka-run-class tools
+5. A longer exhaustive list: Multi-tenancy, 2PC, Adaptive Partitioning, Proxies (and more)
+
+For example, Pulsar was the first to objectively address (1) and parts of (5), but it did introduce additional complexity with Apache BookKeeper. The JVM has been a perennial bottleneck, and therefore a C++ rewrite like Redpanda, armed with a Seastar core and a storage layer built for modern SSDs, beats Kafka benchmarks on throughput, latency and node count relatively easily in most scenarios. WarpStream and AutoMQ take the storage debate to the holy grail of cloud native architectures, i.e., to S3. Confluent, on its part, has built several enterprise features, including tiered storage, but also what is probably the most robust DevX and OperatorX around Kafka.
+
+Obviously these are available in self-hosted and BYOC variations. Not to speak of the cloud-first offerings, such as Confluent Cloud itself, but also the comparable (at least in Kafka-for-Kafka terms) AWS MSK Serverless, Azure Event Hubs and, most recently, Google entering the Kafka market.
+
+In fact, the distinction is so stark that the marketing pitch from nearly everyone involves some flavor of 10X differentiation from Apache Kafka, including Confluent, the proverbial corporate steward.
+
+The truth is that the Apache Foundation project (together with Confluent, which employs a number of committers) has addressed most of these challenges, but largely as a follower. The basic fact is that open source now lags behind.
+
+
+# Streaming is still a low latency niche
+
+Streaming systems are fundamentally designed to optimize for incremental processing and lower latency. While there's no doubt about the value of real-time data, acting upon data in motion requires a much more comprehensive focus, a still volatile and maturing stack, and many inherent complexities (such as the time domain, stateful processing and consistency) which most stakeholders used to batch processing simply don't comprehend.
+
+When asked to trade off latency against the costs and complexity of achieving it, we have seen that most businesses tend to make it an easy decision. They can live with a few more seconds (sometimes hours) of latency.
+
+
+# TCO is shaping the future of Kafka
+
+We talked previously about the mushrooming of the _de-novo Kafkaesque_. While you would expect this market to be all about the hype of real-time and the art of the possible, the ground reality is that sales and adoption conversations are largely led by TCO.
+
+TCO is a very subjective and opaque metric that needs quantification of the customer's operating capability, personnel costs and, most importantly, tangible opportunity cost. In most cases, this is not an honest conversation between sellers and enterprise buyers.
+
+This leads to very contrived forms of differentiation and commercial discounting from vendors of a “must win” nature, relationship plays, package deals, FUD about competition, and often, plain bs.
+
+At the end of the day, it is pretty simple. In private cloud, on-premise and BYOC environments it reduces to these factors:
+
+* number of node or core licenses required (lower is better, relative to present state, realistic projections and head-room for growth)
+* architectures that are AZ-local while maintaining low RTO/RPO.
+
+With cloud (and specifically serverless), it boils down to:
+
+* throughputs (in other words, ingress/egress)
+* concurrency
+* partition counts
+* storage
+
+In a world where everyone prefers serverless options on the cloud and wants to push down durability and avoid cross-AZ surprise billing (down to the cloud provider), the race to the bottom for data infra at the lowest cost will always be won by the holy grail of all data foundations, which is, wait for it... AWS S3. Or more generally, cloud object storage. But we emphasize S3 because AWS is a first mover for all sorts of things on the cloud, and S3 Express One Zone is defining a new frontier by providing consistently sub-10ms latencies directly at the bedrock of durable storage. This would have sounded provocative many years ago, but in 2023-24, a Kafka API directly on top of S3 was just something waiting to be built.
+
+This is exactly what WarpStream is doing, and although there will be fast followers (sometimes pioneers who grudgingly claim independent discovery of the idea), first movement and execution matter. Acknowledging good competition is good.
+
+The writing on the wall is pretty clear. Kafka on the cloud is way more expensive than it needs to be. A Kafka protocol on top of object storage will satisfy the Pareto principle for most workloads in a TCO defense. Latency will only get better in times to come. Soon enough, such clusters will be a commodity. Curiously, in this implementation, there are no leaders, replicas or ISRs. It definitely foreshadows where the Kafka platform landscape is headed: a leaderless future.
+
+
+# Dumb pipes are a commodity. The value is up the chain
+
+So far we have spoken about competitive dynamics. But the surface area of Kafka is actually pretty minimalistic. It mostly provides pub-sub plumbing for a distributed world. Shock absorbers between systems. Big buffers and flood controls. Fire hoses. Fundamentally pipes, though. Dumb pipes at that.
+
+However, with good dumb pipes, you can still build a whole ecosystem of tooling. And that's really where the value lies. Connectors, stream processing, streaming databases, real-time features, real-time databases, event-driven architecture, even ML/AI.
+
+The sum total of all these components and coverage of various cross-cutting concerns forms the streaming platform. A platform on which you can build multi-modal data products.
+
+
+# Instant Data Lakehouse. Far from there
+
+One of the architectural advantages of streams as a foundational data primitive, combined with the trends we have discussed (tiered storage, and no disks at all with object storage), is an interesting innovation: fully embracing the Kappa architecture.
Continuous streams that hydrate lakes, warehouses and everything in between, but without the need for additional plumbing.
+
+The side effect of offloading storage fully to S3 is that your Kafka log segments can be magically turned into open tables which query engines, warehouses and lakehouses can directly talk to. Or, perhaps even more appealing, are the likes of DuckDB and a whole ecosystem of an up-and-coming, headless, composable data stack that layers on top of it.
+
+This has profound implications on the cost and economics of data architectures. However, be cautioned: it is still early days and this is largely architecture astronomy with many feature gaps. Batch and streaming do have pretty stark context boundaries and limited shared vocabulary, let alone good tooling. Even Flink is certainly not there yet. Spark Streaming is fundamentally still a micro-batch primitive under the hood. Projects like Beam have a vision but are fundamentally limited by the runners they integrate with. Over the course of time, one would expect some truly unified experience and consistent APIs. Perhaps with the advent of Arrow, Ibis Data and the like.
+
+
+# Convergence and Divergence
+
+Protocols and APIs always outlive platforms. Kafka certainly enjoys the protocol privilege as it stands. However, in any market that leads to a commodity, it is exceedingly difficult for vendors to keep baselining themselves against an API of another project they don't control.
+
+While open standards such as AsyncAPI and CloudEvents are extant, their vision is pretty narrow and purpose-specific to interoperability, rather than a foundation for streaming.
+
+The natural consequence is that while everything in the short term is still converging to Kafka, it is likely that in the long term, market forces will propel vendors to differentiate.
+
+> For the curious and nerdy, we will publish a series about the internals of NATS and Kinesis, to begin with.
+
+> If you are a Kafka user and interested in our technical whitepaper about the Kafka landscape, register your interest here and we will share a copy. It also contains a holistic analysis of the real-time data stack and SWOT plots for Confluent, Redpanda, WarpStream, MSK, Azure Event Hubs and GCP Kafka for BigQuery.
+
+
diff --git a/_posts/2024-07-11-primer-on-streaming-databases.md b/_posts/2024-07-11-primer-on-streaming-databases.md
new file mode 100644
index 0000000000..15766cbe5d
--- /dev/null
+++ b/_posts/2024-07-11-primer-on-streaming-databases.md
@@ -0,0 +1,134 @@
+---
+layout: post
+title: "A Primer on Streaming Databases"
+authors: avinash
+categories: [Data, Kafka, Streaming Databases, KSQL, Materialize, RisingWave]
+image: assets/blog-images/streaming_databases/StreamingDatabase.png
+featured: false
+teaser: Why streaming databases are the rage and what makes them the hottest thing in the world of streaming. A deep dive into important characteristics of a few streaming DBs with a comparative lens.
+---
+
+
+# Stream Processing
+
+In the modern era of digitalization, data needs to be made available for important business decisions and insights in “real-time”. The definition of “real-time” varies with use-cases and is measured by the latency from the source to the destination. For example, use-cases such as High Frequency Trading (HFT) require sub-microsecond latencies that are not attainable through any software-based stream processing system, only through FPGAs.
Further down the spectrum, we have another hard real-time use case of self-driving cars and industrial automation, which can tolerate a few milliseconds of latency. On the other end, we have the traditional batch processing, which can vary from minutes to hours to even days. + + +![real_time_processing.png](../assets/blog-images/streaming_databases/real_time_spectrum.png) + + + +### Challenges with Stream Processing + +For the rest of the use cases, we can rely on a stream processing system that processes data in “real-time”. For such real-time processing systems, there are constraints within the system, such as constraints on how quickly the data can be made available for processing from various other systems. It is usually the weakest link in the chain that contributes to the constraints. This makes stream processing in “real-time” a hard problem to solve. + +The challenge with stream processing is that stream processing operations are actually defined on the future state of data, i.e., the system needs to be aware of how to process data that arrives in the future. This is different from conventional databases, even data warehouses, or the general landscape of OLAP systems, where the computation is done on data that already exists. + +Another major challenge with stream processing is that the data is unbounded and unpredictable. As with any distributed system, nothing is trustworthy, especially the network. Therefore, unbounded data also means that the data can arrive late or not in order. The stream processing system should be able to handle these challenges and process data while providing a correctness and consistency guarantee. + + +## Stream Processing Frameworks + +There are many systems built to solve the challenges of stream processing in “real-time”. A stream processing engine is capable of processing data as it arrives and handling data that arrives in the future. The processing typically involves filtering, transforming, joining, and aggregating one or more streams of data. Some of the popular frameworks for stream processing are Apache Storm, Apache Samza, Apache Ignite, Apache Flink, Kafka Streams, and Apache Spark Streaming. Each of these frameworks or engines provides unique ways of processing data, with various pros and cons. Among these, Apache Flink is a popular project that can handle both streaming and batch processing. Such stream processing engines provide an interface in a programming language such as Java or Python and typically allow performing low-level computations on streaming data. + + +### SQL: Universal language for data + +Stream processing engines that provide low-level APIs are great for use cases that require either direct access to the state store or processing data at a granular level. However, a lot of use cases can be implemented without writing a lot of boilerplate code and building bloated code bases. SQL has emerged as a solution to this problem, with some of the stream processing engines, such as Apache Flink and Apache Spark Streaming, supporting it. + +SQL has stood the test of time and is a query language familiar to most data engineers. It solves the developer experience problem for stream processing and allows expressing the computation to be performed in a simple DSL that is understood not just by data engineers but also by data scientists or statisticians. This allows expressing the processing operations as SQL like statements instead of functional code. 
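+
+As a rough illustration of that difference in developer experience, here is a windowed count written with the Kafka Streams Java DSL, with a roughly equivalent ksqlDB-style SQL statement shown as a comment. The topic names, the grouping key and the exact SQL dialect are assumptions made for this example, not taken from any of the systems discussed here.
+
+```java
+import org.apache.kafka.common.serialization.Serdes;
+import org.apache.kafka.streams.StreamsBuilder;
+import org.apache.kafka.streams.kstream.Consumed;
+import org.apache.kafka.streams.kstream.Produced;
+import org.apache.kafka.streams.kstream.TimeWindows;
+
+import java.time.Duration;
+
+public class OrdersPerMinute {
+    public static void main(String[] args) {
+        // Roughly the same logic in a ksqlDB-style streaming SQL dialect:
+        //   SELECT item, COUNT(*) FROM orders
+        //   WINDOW TUMBLING (SIZE 1 MINUTE)
+        //   GROUP BY item EMIT CHANGES;
+        StreamsBuilder builder = new StreamsBuilder();
+        builder.stream("orders", Consumed.with(Serdes.String(), Serdes.String()))
+               .groupByKey()                                                  // group by the record key (the item)
+               .windowedBy(TimeWindows.ofSizeWithNoGrace(Duration.ofMinutes(1)))
+               .count()
+               .toStream((windowedItem, count) -> windowedItem.key())        // unwrap the window for the output key
+               .to("orders-per-minute-counts", Produced.with(Serdes.String(), Serdes.Long()));
+        // builder.build() would then be handed to a KafkaStreams instance and started.
+    }
+}
+```
+
+The point is not that the Java version is wrong, but that the SQL form pushes the topology planning, serdes and state wiring down into the engine, which is exactly the gap streaming databases set out to close.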
+
+![FunctionsToSQL.png](../assets/blog-images/streaming_databases/FunctionsToSQL.png)
+
+
+# Streaming Databases
+
+The use of SQL for stream processing led to the emergence of a new category of stream processing engines called streaming databases. Streaming databases provide an interface for stream processing while abstracting some of the complexities involved in running a distributed system for stream processing. KsqlDB was one of the earliest projects in the space and provided an SQL interface on top of Kafka and Kafka Streams. RisingWave, Materialize, and DeltaStream are some of the leading and emerging projects.
+
+
+## Streaming Databases vs Stream Processing vs Real Time OLAP
+
+Although both a stream processing engine and a streaming database serve the same purpose, they are different by definition. A stream processing engine uses a local state, which is distributed across all workers that the framework uses; these state stores might be queryable as well. A streaming database, in contrast, has storage as a first-class concern and decouples storage and compute. In the case of a stream processing framework, the data flow is in the control of the person writing the job. Streaming databases hide the stream processing from the user and have query planners and optimizers that do the heavy lifting for the data flow. Streaming databases are more declarative, while stream processing frameworks are typically imperative.
+
+Real-time analytical databases such as Apache Druid, Apache Pinot, or ClickHouse, on the other hand, are meant for querying data in real-time post ingestion. These databases handle ad-hoc and user-facing analytics, whereas a streaming database is meant to work on data before it is ingested into an RTOLAP database for querying.
+
+
+# Comparing Streaming Databases
+
+While evaluating a streaming database, asking a few questions about the features, design, and capabilities can help.
+
+
+### Sources and Sinks
+
+One of the major considerations is the supported sources and sinks of data. Does it have direct support for Postgres, Oracle CDC, or Change Data Capture from various other platforms out of the box? This avoids having to deploy and maintain an additional component, such as Debezium, for CDC. Another important source/sink is Kafka. Since Kafka has emerged as the de-facto protocol for streaming, it is important that the streaming database support the Kafka protocol for source and sink.
+
+
+### SQL
+
+One of the primary reasons to adopt a streaming database is the developer experience and the abstraction that it provides over a stream processing framework. A key aspect is the dialect of SQL that is supported. For example, many streaming databases are Postgres wire protocol compliant, which helps developers familiar with Postgres adopt it with ease. Typical use-cases in stream processing involve enriching data; hence, it is important to look at the JOINs supported. Finally, the streaming database should have support for various schema formats, such as Avro, JSON, Protobuf, etc.
+
+
+### Consistency Model
+
+A key concern with stream processing is the correctness of the data, which implies the streaming database should support transactions or at least atomicity of the data. In any distributed system, data might arrive later than intended, and the system should be capable of handling such late-arriving data. An important consideration is the types of windows supported for joins and aggregations.
Watermarking has traditionally been a concept in the realm of stream processing frameworks, but there have been recent improvements that bring concepts like watermarking into the world of streaming databases. Support for watermarking is another consideration for evaluation.
+
+
+### Extensibility
+
+Even if the streaming database does not support all the transformations or operations desired, it should allow the user to extend the capabilities of the system using a User Defined Function (UDF). For UDFs, the languages supported and the level of extensibility are key factors for evaluation.
+
+
+### Fault Tolerance
+
+An important aspect of any distributed system dealing with mission-critical data is the ability to recover from failure without losing data. One should understand the state management and the Recovery Point Objective (RPO) / Recovery Time Objective (RTO) that the streaming database supports.
+
+
+### Maturity
+
+When evaluating any software, the maturity and age of the software are basic criteria for evaluation. Another relevant criterion is the adoption of the streaming database. If the streaming database is adopted by companies that are running it at scale with a proven record, it adds to the credibility of the database. Real benchmarks from customers help understand the performance of the system better.
+
+
+### Architecture
+
+Finally, understanding the implementation details helps in designing and troubleshooting the stream processing better. An aspect of evaluation is understanding the data flow model, the key primitives, and the user-facing APIs/constructs.
+
+
+# Comparison of 3 OSS streaming databases
+
+For this blog post, we will be providing a comparison between three open source databases - KsqlDB, RisingWave, and Materialize. We will briefly compare them across the previously mentioned factors and provide a reference for evaluation.
+
+
+### KsqlDB
+
+KsqlDB provides an SQL DSL on top of Kafka Streams, and therefore it inherits both the advantages and disadvantages of Kafka Streams. KsqlDB translates the SQL query and plans the processor topology before executing a Kafka Streams application behind the scenes. KSQL has two processing primitives - Streams and Tables. Tables are essentially compacted streams. KSQL is not ANSI SQL compliant; the syntax is custom. KsqlDB does not support transactions.
+
+KsqlDB integrates out-of-the-box with [Confluent Schema Registry](https://docs.confluent.io/platform/current/schema-registry/index.html), supporting JSON, Avro, JSON Schema, and Protobuf formats. KSQL supports Stream-Stream and Stream-Table joins, and in general, n-way joins. However, there are certain [limitations](https://github.com/confluentinc/ksql/issues?q=is%3Aissue+is%3Aopen+join+label%3Aenhancement) to how joins can be done. In terms of windows, KSQL supports Tumbling, Hopping, and Session windows. There is no concept of watermarking in ksqlDB; however, there is a concept of grace periods. If late-arriving data does not arrive within a specific upper bound, then the window is closed.
+
+For the state store, ksqlDB uses RocksDB, and the state store is backed up through a changelog topic. There is no snapshotting or checkpointing of data. In the event of a disaster, ksqlDB can replay the changelog topic and reconstruct the state. This is a time-consuming process, and hence the RTO is always > 0.
+
+KsqlDB supports Kafka for source and sink.
However, it also allows configuring Kafka connectors using SQL, which implies that all the Kafka connectors can be considered a source and sink for ksqlDB. KsqlDB allows writing custom processing logic using Java-based UDFs. KsqlDB is backed by Confluent and is open source with a [Confluent Community License](https://github.com/confluentinc/ksql/blob/master/LICENSE-ConfluentCommunity).
+
+
+### Materialize
+
+Materialize is based on the Timely Dataflow model and written in Rust. The processing is based on Materialized Views, which are like an aggregation of streams. Materialize also has indexes, which are like tables used for enrichment. The SQL used in Materialize is fully wire compatible with Postgres. Materialize supports transactions as well.
+
+Materialize supports JSON, Avro, and Protobuf formats but does not support JSON Schema yet. Materialize supports all forms of joins, including varieties such as lateral joins or Cartesian cross joins. For windowing, sliding and TTL windows are supported.
+
+Materialize is based on the Timely/Differential Dataflow model, which uses the concept of arrangements to store the state. However, these are stored in memory, and the fault tolerance for these is unclear. Materialize ensures consistency using a concept called virtual timestamps and provides [consistency guarantees](https://materialize.com/docs/get-started/isolation-level/) for input, internal, and output.
+
+Materialize supports Kafka, Postgres, MySQL, and webhooks for sources and Kafka for sinking. There is no support for UDFs; however, there is extensive support for built-in SQL functions. Materialize is maintained by a company with the same name and is under a Business Source License (BSL).
+
+
+### RisingWave
+
+RisingWave is also written in Rust and has Materialized Views and Indexes as the primary processing primitives. RisingWave is fully wire compatible with Postgres and supports transactions.
+
+RisingWave supports Avro, JSON, and Protobuf, along with a few dialects of JSON such as Canal JSON and Maxwell JSON. In terms of joins, RisingWave supports inner and outer joins, with Tumbling and Hopping windows supported.
+
+The state store in RisingWave is a custom state store, which is not quite RocksDB and is called Hummock. Hummock uses S3 for storage and supports checkpointing for fault tolerance. RisingWave adopts the Chandy–Lamport algorithm to create checkpoints. RisingWave supports watermarking, similar to Apache Flink.
+
+RisingWave supports Kafka, Pulsar, S3, Kinesis, Google Pub/Sub, and Postgres/MySQL CDC for sources and Kafka, MySQL, Postgres, Kinesis, Iceberg, and Delta Lake for sinking. UDFs can be written in Python for extensibility. RisingWave is maintained by a company with the same name and is open source under an Apache 2.0 license.
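+
+To ground the extensibility comparison across the three systems with a concrete example: ksqlDB UDFs are plain annotated Java classes packaged as a JAR and dropped into the ksqlDB extensions directory. The function below is a made-up illustration; its name, parameters, and logic are ours and do not come from any of these projects' documentation.
+
+```java
+import io.confluent.ksql.function.udf.Udf;
+import io.confluent.ksql.function.udf.UdfDescription;
+import io.confluent.ksql.function.udf.UdfParameter;
+
+// Hypothetical UDF: applies a tax rate to an order total.
+@UdfDescription(name = "taxed_total", description = "Adds a tax rate to an order total.")
+public class TaxedTotalUdf {
+
+    @Udf(description = "Returns total * (1 + rate).")
+    public double taxedTotal(@UdfParameter(value = "total") final double total,
+                             @UdfParameter(value = "rate") final double rate) {
+        return total * (1 + rate);
+    }
+}
+```
+
+Once registered, it can be invoked from a query like any built-in function, for example `SELECT taxed_total(total, 0.18) FROM orders EMIT CHANGES;`. RisingWave would cover the same need with a Python UDF, while Materialize currently leans on its built-in SQL functions instead.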
+
+
+![streaming_dbs_comparison.png](../assets/blog-images/streaming_databases/streaming_dbs_comparison.png)
\ No newline at end of file
diff --git a/_posts/2024-07-13-unified-access-layer-part-2-rule-based-routing.md b/_posts/2024-07-13-unified-access-layer-part-2-rule-based-routing.md
new file mode 100644
index 0000000000..263ea246b2
--- /dev/null
+++ b/_posts/2024-07-13-unified-access-layer-part-2-rule-based-routing.md
@@ -0,0 +1,556 @@
+---
+layout: post
+title: "Unified access layer for Kafka resources: Part 2: Rule Based Routing Layer"
+authors: p6,raghav
+categories: [Platform Engineering, Infrastructure, Kafka, Unified Access Layer]
+image: assets/blog-images/unified_access_layer/rule_based_routing_layer.png
+featured: false
+hidden: false
+teaser: Unified access layer for Kafka resources, Part 2, Rule Based Routing Layer
+toc: true
+---
+
+# Introduction
+
+This is Part 2, a follow-up to [Part 1](https://platformatory.io/blog/unified-access-layer-kafka-resources-part-1) of the Unified Access Layer for Kafka Resources series, where we dive deep into the design and implementation of discovering Kafka resources via a rule-based routing layer.
+
+# Approach 1: Discover topics through a rule based routing layer
+
+Just to recap: here we are trying to build the pattern where a language-specific SDK handles Kafka resource discovery by talking to a catalog service.
+
+Let’s refer to the overall architecture diagram:
    +flowchart LR + + subgraph PRODUCERS["Producers"] + P1("Producer 1") + P2("Producer 2") + P3("Producer 3") + end + subgraph CG1["Consumer Group 1"] + C1("Consumer 1") + end + subgraph CG2["Consumer Group 2"] + C2("Consumer 1") + C3("Consumer 2") + end + subgraph CG3["Consumer Group 3"] + C4("Consumer 1") + C5("Consumer 2") + end + + subgraph GW["Rule Based Routing Layer"] + direction LR + API["APIs (Kong)"] + US["Upstream (Catalog Service)"] + end + subgraph K1["Kafka Cluster 1"] + B1["Broker 1"] + B2["Broker 2"] + B3["Broker 3"] + end +subgraph K2["Kafka Cluster 2"] + B4["Broker 1"] + B5["Broker 2"] + B6["Broker 3"] + end + + + P1 & P2 & P3 -- 1.GetInfo --> GW -- 2.Response --> P1 & P2 & P3 + CG1 & CG2 & CG3 -- 1.GetInfo --> GW -- 2.Response --> CG1 & CG2 & CG3 + P1 -- 3.Produces --> K1 + P2 -- 3.Produces --> K2 + P3 -- 3.Produces --> CC + API --> US + K1 -- 4.Consumes --> CG1 + K2 -- 4.Consumes --> CG2 + CC -- 4.Consumes --> CG3 + + %% Styling + classDef kafkaStyle fill:#e0f7e9,stroke:#4caf50,stroke-width:2px; + class K1,K2 kafkaStyle; +
    +
+## Implementation
+
+### Catalog Service
+
+This represents the catalog of all the Kafka resources that are needed by clients (producers and consumers). This is implemented with the Kong API Gateway, using
+- A service that represents the Catalog Service upstream
+- A route “/kafka-service-gw” on the service
+- A custom plugin (or a pre-functions/post-functions plugin) that has a set of rules configured to retrieve the right Kafka bootstrap servers and topic information based on a set of query parameters (Channel, ServiceType, Organization). Clients can retrieve the information in a single call (ideally) or in multiple calls.
+- Auth: all these calls are protected using Basic Auth
+
+The code can be found [here](https://github.com/Platformatory/kong-service-gateway).
+
+### Java Implementation for Kafka Producer and Consumer
+
+### Kafka Producer
+As we highlighted above, we need a custom producer that can call the Catalog Service, passing in the required information to locate the bootstrap servers and the topic. These will then be used to produce records (instead of a static configuration of the bootstrap servers and the topic). Here, we create a new ServiceLocatorProducer, which is a facade around the KafkaProducer (it routes all calls to the internal Kafka Producer object). It first calls the Catalog Service, passing the information needed, and obtains the bootstrap servers and the Kafka topic.
+
+As you can see below, any configuration change detected during `send` triggers an automatic refresh of the bootstrap servers and the topic; after that, everything works seamlessly, as the execution output further below shows.
+
+```java
+class BasicInterceptor implements Interceptor {
+    String credentials;
+    BasicInterceptor(String id, String password) {
+        credentials = Credentials.basic(id, password);
+    }
+
+    @NotNull
+    @Override
+    public Response intercept(@NotNull Chain chain) throws IOException {
+        Request request = chain.request();
+        Request.Builder builder = request.newBuilder().header("Authorization", credentials);
+        return chain.proceed(builder.build());
+    }
+}
+
+public class ServiceLocatorProducer<K, V> implements Producer<K, V> {
+    private OkHttpClient client = new OkHttpClient.Builder().build();
+    private LoadingCache<String, Map<String, String>> cache;
+    public static final String CACHE_KEY = "service-map";
+    public static final String KAFKA_TOPIC_KEY = "kafka_topic";
+    public static final String SERVICE_LOCATOR_BASE_URL = "http://localhost:8000/kafka-service-gw/";
+    private Properties properties;
+
+    private KafkaProducer<K, V> kafkaProducer;
+
+    private void initCache() throws ExecutionException {
+        cache = CacheBuilder.newBuilder()
+                .maximumSize(10)
+                .expireAfterWrite(5, TimeUnit.SECONDS)
+                .build(
+                        new CacheLoader<String, Map<String, String>>() {
+                            public Map<String, String> load(String id) throws IOException {
+                                final Map<String, String> svcMap = getServiceConfiguration();
+                                return svcMap;
+                            }
+                        }
+                );
+    }
+
+    private void createProducer(Properties properties) throws ExecutionException {
+        properties.put(BOOTSTRAP_SERVERS_CONFIG, cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG));
+
+        System.out.println(cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG));
+
+        this.properties = properties;
+
+        kafkaProducer = new KafkaProducer<>(properties);
+    }
+
+    ServiceLocatorProducer(Properties properties) throws ExecutionException {
+        initCache();
+
+        createProducer(properties);
+    }
+
+    String getTopic() throws ExecutionException {
+        return cache.get(CACHE_KEY).get(KAFKA_TOPIC_KEY);
+    }
+
+    public Map<String, String> getServiceConfiguration() throws IOException {
+        OkHttpClient client = new OkHttpClient.Builder()
+                .callTimeout(5, TimeUnit.MINUTES)
+                .connectTimeout(5, TimeUnit.MINUTES)
+                .addInterceptor(new BasicInterceptor("user1", "password1"))
+                .build();
+
+        Map<String, String> kafkaSvcLocMap = new HashMap<>();
+
+        String topic = getTopicConfiguration(client);
+        kafkaSvcLocMap.put(KAFKA_TOPIC_KEY, topic);
+
+        String bootstrapServersConfig = getBootstrapServersConfig(client);
+
+        kafkaSvcLocMap.put(BOOTSTRAP_SERVERS_CONFIG, bootstrapServersConfig);
+
+        return kafkaSvcLocMap;
+    }
+
+    @NotNull
+    private String getBootstrapServersConfig(OkHttpClient client) throws IOException {
+        Request request = new Request.Builder()
+                .url(SERVICE_LOCATOR_BASE_URL + "kafka_clusters?domain=example.org")
+                .build();
+
+        Response response = client.newCall(request).execute();
+        if (!response.isSuccessful()) throw new IOException("Unexpected code " + response);
+
+        JsonObject jsonObject = JsonParser.parseString(response.body().string()).getAsJsonObject();
+        System.out.println("bootstrap:" + jsonObject.get("bootstrap_servers").getAsString());
+        return jsonObject.get("bootstrap_servers").getAsString();
+    }
+
+    @NotNull
+    private String getTopicConfiguration(OkHttpClient client) throws IOException {
+        Request request = new Request.Builder()
+                .url(SERVICE_LOCATOR_BASE_URL + "channels?channel_name=channel1")
+                .build();
+
+        Response response = client.newCall(request).execute();
+        if (!response.isSuccessful()) throw new IOException("Unexpected code " + response);
+
+        JsonObject jsonObject = JsonParser.parseString(response.body().string()).getAsJsonObject();
+        System.out.println(jsonObject.get("resolved_value").getAsString());
+        JsonObject innerObject = JsonParser.parseString(jsonObject.get("resolved_value").getAsString()).getAsJsonObject();
+        System.out.println(innerObject.get("topic1"));
+        return innerObject.get("topic1").getAsString();
+    }
+
+    @Override
+    public Future<RecordMetadata> send(ProducerRecord<K, V> producerRecord) {
+        return send(producerRecord, null);
+    }
+
+    @Override
+    public Future<RecordMetadata> send(ProducerRecord<K, V> producerRecord, Callback callback) {
+        String bootstrapServers = null;
+        String topic = null;
+        try {
+            bootstrapServers = cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG);
+            topic = cache.get(CACHE_KEY).get(KAFKA_TOPIC_KEY);
+        } catch (ExecutionException e) {
+            throw new RuntimeException(e);
+        }
+
+        if (this.properties.get(BOOTSTRAP_SERVERS_CONFIG).equals(bootstrapServers) &&
+                producerRecord.topic().equals(topic)) {
+            return kafkaProducer.send(producerRecord, callback);
+        } else {
+            System.out.printf(
+                    "Need to update bootstrap servers config to %s and the topic config to %s and create a new Producer\n", bootstrapServers, topic);
+            this.close();
+            properties.put(BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
+            kafkaProducer = new KafkaProducer<>(properties);
+            ProducerRecord<K, V> newRecord = new ProducerRecord<>(topic, producerRecord.key(), producerRecord.value());
+            return kafkaProducer.send(newRecord, callback);
+        }
+    }
+```
+
+### Kafka Producer App
+
+Typically, the producer application would include all the properties needed to successfully publish records to the Kafka cluster (especially the two key things: the bootstrap servers and the topic). As shown below, the ServiceLocatorProducer does the magic of getting the right bootstrap servers and topic (instead of the commented-out static configuration).
+ +```java +public class App { + + public static void main(String[] args ) throws Exception, ExecutionException, InterruptedException { + + Properties config = new Properties(); + + // Not adding the bootstrap.servers config because it will be retrieved automatically + config.put(ACKS_CONFIG, "all"); + config.put(KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getCanonicalName()); + config.put(VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getCanonicalName()); + + String [] keys = {"joe", "jill", "justie"}; + String [] values = {"crease", "myers", "hill"}; + + try (final ServiceLocatorProducer producer = new ServiceLocatorProducer<>(config)) { + final Random rnd = new Random(); + final int numMessages = 10000; + for (int i = 0; i < numMessages; i++) { + String user = keys[rnd.nextInt(keys.length)]; + String item = values[rnd.nextInt(values.length)]; + + // topic is obtained automatically from the producer and updated if it is different + // when we receive the topic from the RecordMetadata received from the send method + AtomicReference topic = new AtomicReference<>(producer.getTopic()); + + producer.send( + new ProducerRecord<>(topic.get(), user, item), + (event, ex) -> { + if (ex != null) + ex.printStackTrace(); + else { + System.out.printf("Produced event to topic %s: key = %-10s value = %s%n", topic, user, item); + if (!topic.get().equals(event.topic())) { + topic.set(event.topic()); + } + } + }); + // Only to demonstrate the change in configurations + Thread.sleep(100); + } + + System.out.printf("%s events were produced to topic %s%n", numMessages, producer.getTopic()); + + } + } +} + +``` + +### Kafka Producer in Action +Just showing the excerpts from the full execution here. + +``` +C:\Users\nragh\.jdks\corretto-11.0.22\bin\java.exe "-javaagent:C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2023.2.2\lib\idea_rt.jar=55727:C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2023.2.2\bin" -Dfile.encoding=UTF-8 -classpath 
C:\Users\nragh\IdeaProjects\kafka-producer\target\classes;C:\Users\nragh\.m2\repository\org\apache\kafka\kafka-clients\7.0.1-ccs\kafka-clients-7.0.1-ccs.jar;C:\Users\nragh\.m2\repository\com\github\luben\zstd-jni\1.5.0-2\zstd-jni-1.5.0-2.jar;C:\Users\nragh\.m2\repository\org\lz4\lz4-java\1.7.1\lz4-java-1.7.1.jar;C:\Users\nragh\.m2\repository\org\xerial\snappy\snappy-java\1.1.8.1\snappy-java-1.1.8.1.jar;C:\Users\nragh\.m2\repository\org\slf4j\slf4j-api\1.7.30\slf4j-api-1.7.30.jar;C:\Users\nragh\.m2\repository\org\apache\commons\commons-lang3\3.14.0\commons-lang3-3.14.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okhttp3\okhttp\4.12.0\okhttp-4.12.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okio\okio\3.6.0\okio-3.6.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okio\okio-jvm\3.6.0\okio-jvm-3.6.0.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-common\1.9.10\kotlin-stdlib-common-1.9.10.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-jdk8\1.8.21\kotlin-stdlib-jdk8-1.8.21.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib\1.8.21\kotlin-stdlib-1.8.21.jar;C:\Users\nragh\.m2\repository\org\jetbrains\annotations\13.0\annotations-13.0.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-jdk7\1.8.21\kotlin-stdlib-jdk7-1.8.21.jar;C:\Users\nragh\.m2\repository\com\google\guava\guava\33.2.1-jre\guava-33.2.1-jre.jar;C:\Users\nragh\.m2\repository\com\google\guava\failureaccess\1.0.2\failureaccess-1.0.2.jar;C:\Users\nragh\.m2\repository\com\google\guava\listenablefuture\9999.0-empty-to-avoid-conflict-with-guava\listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar;C:\Users\nragh\.m2\repository\com\google\code\findbugs\jsr305\3.0.2\jsr305-3.0.2.jar;C:\Users\nragh\.m2\repository\org\checkerframework\checker-qual\3.42.0\checker-qual-3.42.0.jar;C:\Users\nragh\.m2\repository\com\google\errorprone\error_prone_annotations\2.26.1\error_prone_annotations-2.26.1.jar;C:\Users\nragh\.m2\repository\com\google\j2objc\j2objc-annotations\3.0.0\j2objc-annotations-3.0.0.jar;C:\Users\nragh\.m2\repository\com\google\code\gson\gson\2.11.0\gson-2.11.0.jar com.platformatory.App +{"topic1": "users", "topic2": "users-1"} +"users" +bootstrap:localhost:9092,localhost:9093,localhost:9094 +localhost:9092,localhost:9093,localhost:9094 +SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". +SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. +Produced event to topic users: key = joe value = myers +Produced event to topic users: key = joe value = crease +Produced event to topic users: key = justie value = myers +Produced event to topic users: key = jill value = hill +Produced event to topic users: key = jill value = hill +Produced event to topic users: key = joe value = hill +Produced event to topic users: key = justie value = hill +Produced event to topic users: key = justie value = crease +Produced event to topic users: key = joe value = myers +... 
+bootstrap:localhost:9092,localhost:9093 +Need to update bootstrap servers config to {localhost:9092,localhost:9093} and the topic config to {users-1} and create a new Producer +Produced event to topic users-1: key = joe value = crease +Produced event to topic users-1: key = justie value = crease +Produced event to topic users-1: key = joe value = myers +Produced event to topic users-1: key = justie value = crease +Produced event to topic users-1: key = justie value = myers +Produced event to topic users-1: key = jill value = hill +Produced event to topic users-1: key = justie value = hill +Produced event to topic users-1: key = joe value = hill +Produced event to topic users-1: key = jill value = myers +Produced event to topic users-1: key = justie value = hill +Produced event to topic users-1: key = jill value = hill +Produced event to topic users-1: key = joe value = hill +... + +``` + +### Kafka Consumer +Just like the facade created by the Kafka Producer, we do the same here called the ServiceLocatorConsumer, which routes all the calls to the internal Kafka Consumer object. Also, just like the ServiceLocatorProducer, it calls the Catalog Service and obtains the same 2 key properties needed to consume data. + +```java +class BasicInterceptor implements Interceptor { + String credentials; + BasicInterceptor(String id, String password) { + credentials = Credentials.basic(id, password); + } + + @NotNull + @Override + public Response intercept(@NotNull Chain chain) throws IOException { + Request request = chain.request(); + Request.Builder builder = request.newBuilder().header("Authorization", credentials); + return chain.proceed(builder.build()); + } +} + +public class ServiceLocatorConsumer implements Consumer { + private OkHttpClient client; + private LoadingCache> cache; + public static final String CACHE_KEY = "service-map"; + public static final String KAFKA_TOPIC_KEY = "kafka_topic"; + public static final String SERVICE_LOCATOR_BASE_URL = "http://localhost:8000/kafka-service-gw/"; + private Properties properties; + private KafkaConsumer kafkaConsumer; + + public ServiceLocatorConsumer(Map configs) { + kafkaConsumer = new KafkaConsumer<>(configs); + } + + private void initCache() throws ExecutionException { + cache = CacheBuilder.newBuilder() + .maximumSize(100) + .expireAfterWrite(5, TimeUnit.SECONDS) + .build( + new CacheLoader>() { + public Map load(String id) throws IOException { + final Map svcMap = getServiceConfiguration(); + return svcMap; + } + } + ); + } + private void createConsumer(Properties properties) throws ExecutionException { + properties.put(BOOTSTRAP_SERVERS_CONFIG, cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG)); + System.out.println(cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG)); + + this.properties = properties; + + kafkaConsumer = new KafkaConsumer<>(properties); + } + + public ServiceLocatorConsumer(Properties properties) throws ExecutionException { + client = new OkHttpClient.Builder().build(); + + initCache(); + + createConsumer(properties); + } + + public Map getServiceConfiguration() throws IOException { + OkHttpClient client = new OkHttpClient.Builder() + .callTimeout(5, TimeUnit.MINUTES) + .connectTimeout(5, TimeUnit.MINUTES) + .addInterceptor(new BasicInterceptor("user1", "password1")) + .build(); + + Map kafkaSvcLocMap = new HashMap<>(); + + String topic = getTopicConfiguraion(client); + kafkaSvcLocMap.put(KAFKA_TOPIC_KEY, topic); + + String bootstrapServersConfig = getBootstrapServersConfig(client); + + 
kafkaSvcLocMap.put(BOOTSTRAP_SERVERS_CONFIG, bootstrapServersConfig); + + return kafkaSvcLocMap; + } + + @NotNull + private String getBootstrapServersConfig(OkHttpClient client) throws IOException { + Request request = new Request.Builder() + .url(SERVICE_LOCATOR_BASE_URL + "kafka_clusters?domain=example.org") + .build(); + + Response response = client.newCall(request).execute(); + if (!response.isSuccessful()) throw new IOException("Unexpected code " + response); + + JsonObject jsonObject = JsonParser.parseString(response.body().string()).getAsJsonObject(); + System.out.println("bootstrap:" + jsonObject.get("bootstrap_servers").getAsString()); + return jsonObject.get("bootstrap_servers").getAsString(); + } + + @NotNull + private String getTopicConfiguraion(OkHttpClient client) throws IOException { + Request request = new Request.Builder() + .url(SERVICE_LOCATOR_BASE_URL + "channels?channel_name=channel1") + .build(); + + Response response = client.newCall(request).execute(); + if (!response.isSuccessful()) throw new IOException("Unexpected code " + response); + + JsonObject jsonObject = JsonParser.parseString(response.body().string()).getAsJsonObject(); + System.out.println(jsonObject.get("resolved_value").getAsString()); + JsonObject innerObject = JsonParser.parseString(jsonObject.get("resolved_value").getAsString()).getAsJsonObject(); + System.out.println(innerObject.get("topic1")); + return innerObject.get("topic1").getAsString(); + } + + @Override + public ConsumerRecords poll(long l) { + return kafkaConsumer.poll(l); + } + + @Override + public ConsumerRecords poll(Duration duration) { + String bootstrapServers = null; + String topic = null; + try { + bootstrapServers = cache.get(CACHE_KEY).get(BOOTSTRAP_SERVERS_CONFIG); + topic = cache.get(CACHE_KEY).get(KAFKA_TOPIC_KEY); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + + if (!this.properties.get(BOOTSTRAP_SERVERS_CONFIG).equals(bootstrapServers) || + !this.listTopics().containsKey(topic)) { + System.out.printf( + "Need to update bootstrap servers config to %s from %s and create a new Consumer\n", bootstrapServers, properties.get(BOOTSTRAP_SERVERS_CONFIG)); + properties.put(BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + + this.unsubscribe(); + + this.close(); + + kafkaConsumer = new KafkaConsumer<>(properties); + + kafkaConsumer.subscribe(Arrays.asList(topic)); + + } + return kafkaConsumer.poll(duration); + } + +``` + +### Kafka Consumer App +Just like the Kafka Producer App, the Consumer App instantiates the ServiceLocatorConsumer and seamlessly starts consuming from the obtained topic. You can see how the commented out code for the bootstrap servers and the topic show the dynamic nature of the whole system. Also notice that if a configuration change happens during the poll, we automatically refresh the bootstrap servers or the topic configuration and take care of: 1/ updating the bootstrap servers, 2/ on a topic change, we unsubscribe to the previous topics, close the consumer, create a new consumer and subscribe to the new topic(s). Post this, everything gets back to normal. 
+ +```java +public class App +{ + public static void main( String[] args ) throws ExecutionException, InterruptedException { + Properties config = new Properties(); + try { + config.put("client.id", InetAddress.getLocalHost().getHostName()); + } catch (UnknownHostException e) { + throw new RuntimeException(e); + } + + // Not adding the bootstrap.servers config because it will be retrieved automatically + config.put(AUTO_OFFSET_RESET_CONFIG, "earliest"); + config.put(GROUP_ID_CONFIG, "kafka-java-consumer"); + config.put(KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getCanonicalName()); + config.put(VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getCanonicalName()); + + // topic is obtained automatically from the producer and updated if it is different + // when we receive the topic from the ConsumerRecord received from the poll method + + try (final ServiceLocatorConsumer consumer = new ServiceLocatorConsumer<>(config)) { + consumer.subscribe(Arrays.asList(consumer.getTopic())); + while (true) { + ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); + for (ConsumerRecord record : records) { + String key = record.key(); + String value = record.value(); + String topic = record.topic(); + System.out.println( + String.format("Consumed event from topic %s: key = %-10s value = %s", topic, key, value)); + } + } + } + } +} + +``` + +### Kafka Consumer in Action +``` +C:\Users\nragh\.jdks\corretto-11.0.22\bin\java.exe "-javaagent:C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2023.2.2\lib\idea_rt.jar=55738:C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2023.2.2\bin" -Dfile.encoding=UTF-8 -classpath C:\Users\nragh\IdeaProjects\kakfa-consumer\target\classes;C:\Users\nragh\.m2\repository\org\apache\kafka\kafka-clients\7.6.1-ce\kafka-clients-7.6.1-ce.jar;C:\Users\nragh\.m2\repository\io\confluent\telemetry-events-api\7.6.1-ce\telemetry-events-api-7.6.1-ce.jar;C:\Users\nragh\.m2\repository\com\github\luben\zstd-jni\1.5.5-1\zstd-jni-1.5.5-1.jar;C:\Users\nragh\.m2\repository\org\lz4\lz4-java\1.8.0\lz4-java-1.8.0.jar;C:\Users\nragh\.m2\repository\org\xerial\snappy\snappy-java\1.1.10.5\snappy-java-1.1.10.5.jar;C:\Users\nragh\.m2\repository\org\slf4j\slf4j-api\1.7.36\slf4j-api-1.7.36.jar;C:\Users\nragh\.m2\repository\com\fasterxml\jackson\core\jackson-databind\2.11.0\jackson-databind-2.11.0.jar;C:\Users\nragh\.m2\repository\com\fasterxml\jackson\core\jackson-annotations\2.11.0\jackson-annotations-2.11.0.jar;C:\Users\nragh\.m2\repository\com\fasterxml\jackson\core\jackson-core\2.11.0\jackson-core-2.11.0.jar;C:\Users\nragh\.m2\repository\org\apache\commons\commons-lang3\3.14.0\commons-lang3-3.14.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okhttp3\okhttp\4.12.0\okhttp-4.12.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okio\okio\3.6.0\okio-3.6.0.jar;C:\Users\nragh\.m2\repository\com\squareup\okio\okio-jvm\3.6.0\okio-jvm-3.6.0.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-common\1.9.10\kotlin-stdlib-common-1.9.10.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-jdk8\1.8.21\kotlin-stdlib-jdk8-1.8.21.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib\1.8.21\kotlin-stdlib-1.8.21.jar;C:\Users\nragh\.m2\repository\org\jetbrains\annotations\13.0\annotations-13.0.jar;C:\Users\nragh\.m2\repository\org\jetbrains\kotlin\kotlin-stdlib-jdk7\1.8.21\kotlin-stdlib-jdk7-1.8.21.jar;C:\Users\nragh\.m2\repository\com\google\guava\guava\33.2.1-jre\guava-33.2.1-jre.jar;C:\Users\nragh\.m2\repository\com\google\gu
ava\failureaccess\1.0.2\failureaccess-1.0.2.jar;C:\Users\nragh\.m2\repository\com\google\guava\listenablefuture\9999.0-empty-to-avoid-conflict-with-guava\listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar;C:\Users\nragh\.m2\repository\com\google\code\findbugs\jsr305\3.0.2\jsr305-3.0.2.jar;C:\Users\nragh\.m2\repository\org\checkerframework\checker-qual\3.42.0\checker-qual-3.42.0.jar;C:\Users\nragh\.m2\repository\com\google\errorprone\error_prone_annotations\2.26.1\error_prone_annotations-2.26.1.jar;C:\Users\nragh\.m2\repository\com\google\j2objc\j2objc-annotations\3.0.0\j2objc-annotations-3.0.0.jar;C:\Users\nragh\.m2\repository\com\google\code\gson\gson\2.11.0\gson-2.11.0.jar com.platformatory.App +{"topic1": "users", "topic2": "users-1"} +"users" +bootstrap:localhost:9092,localhost:9093,localhost:9094 +localhost:9092,localhost:9093,localhost:9094 +SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder". +SLF4J: Defaulting to no-operation (NOP) logger implementation +SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details. +Consumed event from topic users: key = justie value = myers +Consumed event from topic users: key = justie value = crease +Consumed event from topic users: key = justie value = crease +Consumed event from topic users: key = justie value = crease +Consumed event from topic users: key = justie value = crease +Consumed event from topic users: key = justie value = hill +Consumed event from topic users: key = justie value = myers +Consumed event from topic users: key = justie value = myers +... +bootstrap:localhost:9092,localhost:9093 +Need to update bootstrap servers config to {localhost:9092,localhost:9093} from {localhost:9092,localhost:9093,localhost:9094} and create a new Consumer +Consumed event from topic users-1: key = joe value = crease +Consumed event from topic users-1: key = joe value = myers +Consumed event from topic users-1: key = joe value = hill +Consumed event from topic users-1: key = joe value = hill +Consumed event from topic users-1: key = joe value = hill +Consumed event from topic users-1: key = joe value = myers +Consumed event from topic users-1: key = justie value = crease +Consumed event from topic users-1: key = justie value = crease +Consumed event from topic users-1: key = justie value = myers +Consumed event from topic users-1: key = justie value = hill +Consumed event from topic users-1: key = justie value = hill +... + +``` + +## Python Implementation +Coming soon... + + +# Conclusion + +Creating a system that helps a centralized Kafka infrastructure team to easily create, label and vend information reduces common problems and dependencies. Same benefits are passed on to the producers and consumers thus creating a scalable system/organization. diff --git a/_posts/2024-07-22-Migrating-Zookeeper-to-kraft.md b/_posts/2024-07-22-Migrating-Zookeeper-to-kraft.md new file mode 100644 index 0000000000..68ad92f96b --- /dev/null +++ b/_posts/2024-07-22-Migrating-Zookeeper-to-kraft.md @@ -0,0 +1,225 @@ +--- +layout: post +title: "Migrating Zookeeper to Kraft" +categories: [Kafka Migration, Kafka Architecture, KRaft, Distributed Systems] +teaser: Ready to simplify your Kafka setup? Discover why moving from ZooKeeper to KRaft could be a game-changer for your infrastructure. This guide walks you through each step of the migration process, from retrieving your cluster ID to completing the transition. Unlock the benefits of KRaft and streamline your Kafka management today! 
+author: Shivaprakash +image: assets/blog-images/migrations/0_TB4m7ZfPMgHF_ljN.png +toc: true +--- + +# **Introduction** + +In its evolution, Kafka has recently shifted away from relying on ZooKeeper to manage its essential metadata and now embraces a quorum-based controller using Kafka Raft, or KRaft (pronounced 'craft'), which has been generally available since Confluent Platform version 7.4. + +Apache Kafka has traditionally entrusted ZooKeeper with crucial tasks such as storing metadata, managing brokers, controlling topics, and overseeing cluster configurations. Initially, this centralized approach with ZooKeeper was ideal for ensuring robust distributed coordination and reliable leader election mechanisms. However, the complexity of maintaining ZooKeeper alongside a Kafka cluster has become evident over time. Moreover, ZooKeeper's limitations have emerged as a bottleneck, restricting the scalability of Kafka clusters by limiting the number of partitions a single broker can effectively manage. + +Recognizing these challenges and seeking to streamline Kafka's architecture, the Kafka community introduced [KIP-500](https://cwiki.apache.org/confluence/display/KAFKA/KIP-500%3A+Replace+ZooKeeper+with+a+Self-Managed+Metadata+Quorum). This initiative aims to phase out ZooKeeper and introduce a more efficient metadata quorum solution. This blog post is dedicated to guiding you through the transition from ZooKeeper to KRaft. We will provide a step-by-step roadmap for migrating your Kafka deployment to KRaft. + +# **Why Bye to ZooKeeper ?** + +ZooKeeper data is replicated across a number of nodes forming an ensemble, using the ZooKeeper Atomic Broadcast (ZAB) protocol to ensure data consistency across all nodes. However, integrating ZooKeeper with Kafka involves managing a separate distributed system alongside Kafka itself, which introduces additional complexities in deployment, management, and troubleshooting. + +Moreover, ZooKeeper introduces scalability bottlenecks that limit the number of topics and partitions supported within a Kafka cluster. For example, during controller failover events, the newly elected controller must retrieve metadata from ZooKeeper, including details for all topics, which can impact performance. Additionally, any metadata updates require propagation to other brokers via RPCs, and as the number of partitions increases, the propagation time grows, potentially slowing down the system. + +In contrast, other distributed systems like MongoDB, Cassandra, and Elasticsearch handle metadata management internally, eliminating the need for external tools like ZooKeeper. This streamlined approach significantly simplifies deployment and operational management—imagine handling just one distributed system instead of two! Furthermore, internal metadata management enhances scalability and efficiency, optimizing operations and providing stronger guarantees for Kafka's functionality. + +# **Hello from KRaft!** + +To address challenges with ZooKeeper, the Kafka community introduced KRaft, a new way to handle metadata directly within Kafka itself. Unlike ZooKeeper, which required managing a separate system alongside Kafka, KRaft integrates metadata management into Kafka using an event-driven approach. + +KRaft uses a quorum-based controller with an event-driven implementation of the Raft protocol. This controller manages an event log stored in a special topic named "__cluster_metadata." 
Unlike regular topics, data in "__cluster_metadata" is written synchronously to disk, ensuring reliability required by Raft. + +## **Advantages of KRaft** + + + +* Simplicity: KRaft streamlines Kafka’s architecture by eliminating the need for a separate coordination service like ZooKeeper. Users and operators only need to manage one system, and KRaft uses the same configuration, failure handling, and security mechanisms as Kafka’s data plane, making it easier to learn and operate. +* Scalability: KRaft enhances Kafka’s scalability by reducing the load on the metadata store. In KRaft mode, only a subset of brokers, known as the controller quorum, handles metadata. This setup minimizes connections and requests to the metadata store, enabling Kafka to support more brokers and topics without impacting performance. +* Availability: KRaft improves Kafka’s availability by allowing the system to handle partial failures. Only a quorum of controllers is needed to process requests, so if some controllers are down or disconnected, the remaining ones can still maintain cluster operations. This makes Kafka more resilient to network issues and data center failures. +* Simplified Deployment and Management: With KRaft, you no longer need to manage a separate ZooKeeper cluster, reducing operational complexity and costs. Kafka users can continue using existing tools and APIs, such as the Admin API and kafka-reassign-partitions tool, for managing their clusters. +* Increased Security: KRaft supports encryption and authentication for client-server communication using SSL/TLS or SASL, ensuring that Kafka metadata is protected from unauthorized access or tampering. + +By adopting KRaft, Kafka enhances its scalability, simplifies operations, and integrates metadata management directly into its architecture, offering a more efficient alternative to ZooKeeper. + +# **How to migrate from ZooKeeper to KRaft** + +While KRaft has been production-ready for several releases now, with continuous feature enhancements, the majority of existing Kafka clusters still rely on ZooKeeper. However, there's growing interest among users to migrate to KRaft to overcome the limitations associated with ZooKeeper, as discussed earlier. + +Moreover, ZooKeeper will be deprecated within the Kafka project, with support expected to be fully removed in an upcoming release. This development underscores the urgency for Kafka users to plan and execute migrations from ZooKeeper to KRaft. + +Before beginning the migration process, our Kafka brokers are currently operating in ZooKeeper-mode and are connected to the ZooKeeper ensemble where metadata is stored. Let's follow the steps below to migrate to KRaft. + +## **Step 1: Retrieve the cluster ID** + + + +* You must format storage for your Kafka cluster with the ID of the existing cluster. You can get this ID with the zookeeper-shell tool. + + +![Image-2](../assets/blog-images/migrations/Image2.png) + + +## **Step 2: Configure a KRaft controller quorum** + + + +* Set up the KRaft controller quorum responsible for managing metadata in KRaft mode. +* Number of KRaft controller nodes should match the current ZooKeeper ensemble size. +* Migration does not support transitioning to "combined" mode nodes. +* Configure each KRaft controller node with connections to ZooKeeper. +* Add the flag zookeeper.metadata.migration.enable=true to signal the migration intent. +* Upon startup, KRaft controllers establish a quorum and elect a leader. +* Controllers enter a waiting state for Kafka brokers to register. 
+ +``` +process.roles=controller +node.id=3000 +controller.quorum.voters=3000@localhost:9093 +controller.listener.names=CONTROLLER +listeners=CONTROLLER://:9093 +# Enable the migration + zookeeper.metadata.migration.enable=true +# ZooKeeper client configuration + zookeeper.connect=localhost:2181 +# Enable migrations for cluster linking + confluent.cluster.link.metadata.topic.enable=true +# Other configuration entries +``` + + + +## **Step 3: Format storage with the ID you saved previously** + + + +* Format storage on each node with the ID and the controller configuration file. + + +![Image-1](../assets/blog-images/migrations/image1.png) + + +## **Step 4: Start each controller** + + + +* Start each controller, specifying the configuration file with migration enabled. +``` +./bin/kafka-server-start.sh ./etc/kafka/kraft/controller.properties +``` + +## **Step 5: Enable migration on the brokers** + + + +* Modify the Kafka broker configurations to include connection details for all nodes in the KRaft controller quorum. +* Enable the migration process using `zookeeper.metadata.migration.enable=true` in the broker configurations. +* Roll restart each Kafka broker one by one after updating its configuration. +* Upon restart, brokers will register with the KRaft controller quorum instead of ZooKeeper. +* This transition phase is referred to as the "hybrid phase," where some brokers are still using ZooKeeper while the KRaft controller quorum starts managing metadata. +* The KRaft controller leader initiates the migration process by copying all metadata from ZooKeeper to the __cluster_metadata topic in Kafka. +* This ensures that all metadata (broker registrations, topic configurations, etc.) are replicated and synchronized across the KRaft quorum. + + Following is an example configuration file for a broker that is ready for the KRaft migration. + + +``` +broker.id=0 +listeners=PLAINTEXT://:9092 +advertised.listeners=PLAINTEXT://localhost:9092 +listener.security.protocol.map=PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT + +# Set the IBP + inter.broker.protocol.version=3.6 + +# Enable the migration + zookeeper.metadata.migration.enable=true + +# Cluster linking metadata topic enabled + confluent.cluster.link.metadata.topic.enable=true + +# ZooKeeper client configuration + zookeeper.connect=localhost:2181 + +# KRaft controller quorum configuration + controller.quorum.voters=3000@localhost:9093 + controller.listener.names=CONTROLLER +``` + +## **Step 6: Migrate the brokers** + + + +* The KRaft controller operates in migration mode, sending RPCs like UpdateMetadata and LeaderAndIsr to ZooKeeper-mode brokers. +* Update broker configuration to switch from ZooKeeper-mode to KRaft-mode. +* Replace `broker.id` with `node.id` while maintaining the same identifier. +* Add `process.roles=broker` to indicate KRaft mode. +* Remove ZooKeeper configuration and any cluster linking entries from broker configuration. +* Update ACLs if necessary, especially changing the authorizer class. +* Restart the broker to apply the new configuration and complete the migration process. + + Following is an example of how a server.properties file for a migrated broker might look. Note that ZooKeeper-specific properties are commented out. 
+
+
+```
+process.roles=broker
+node.id=0
+listeners=PLAINTEXT://:9092
+advertised.listeners=PLAINTEXT://localhost:9092
+listener.security.protocol.map=PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT
+
+# Remove the IBP, KRaft uses "metadata.version" feature flag
+# inter.broker.protocol.version=3.6
+
+# Remove the migration enabled flag
+# zookeeper.metadata.migration.enable=true
+
+# Remove the cluster linking metadata topic setting
+# confluent.cluster.link.metadata.topic.enable=true
+
+# Remove ZooKeeper client configuration
+# zookeeper.connect=localhost:2181
+
+# Keep the KRaft controller quorum configuration
+ controller.quorum.voters=3000@localhost:9093
+ controller.listener.names=CONTROLLER
+
+# If using ACLs, change the authorizer from AclAuthorizer used for ZooKeeper to the StandardAuthorizer used for KRaft.
+ authorizer.class.name=org.apache.kafka.metadata.authorizer.StandardAuthorizer
+```
+
+
+## **Step 7: Take KRaft controllers out of migration mode**
+
+
+
+* Disable Migration: Comment out or remove the zookeeper.metadata.migration.enable=true property. This signals that the controllers are no longer in migration mode and are fully operational in KRaft mode.
+* Remove ZooKeeper Configuration: Comment out or remove any ZooKeeper-specific configuration entries (zookeeper.connect=localhost:2181 in this example). Since KRaft mode no longer relies on ZooKeeper for metadata management, these entries are no longer needed.
+
+ Example `controller.properties` File for KRaft Mode
+
+
+```
+process.roles=controller
+node.id=3000
+controller.quorum.voters=3000@localhost:9093
+controller.listener.names=CONTROLLER
+listeners=CONTROLLER://:9093
+
+# Disable migration
+# zookeeper.metadata.migration.enable=true
+
+# Remove ZooKeeper client configuration.
+# zookeeper.connect=localhost:2181
+```
+
+
+
+# **Conclusion**
+
+In this article, we've explored the role of ZooKeeper in Kafka and why Kafka has opted to transition to a quorum-based controller, KRaft. The Apache Kafka community has deprecated ZooKeeper for storing cluster metadata, with plans for complete removal in an upcoming release. This signifies that users will need to transition towards KRaft-based clusters in the near future. Given the prevalence of ZooKeeper-based clusters currently in production, migrating to KRaft involves significant manual intervention, including configuration updates and rolling node restarts.
+
+
+
+
+
diff --git a/_posts/2024-07-23-Real-time-data-architecture-in-retail.md b/_posts/2024-07-23-Real-time-data-architecture-in-retail.md
new file mode 100644
index 0000000000..f1f3bf966d
--- /dev/null
+++ b/_posts/2024-07-23-Real-time-data-architecture-in-retail.md
@@ -0,0 +1,505 @@
+---
+layout: post
+title: "Real Time Data Architecture In Retail"
+authors: ashwin
+categories: [Platform Engineering, Infrastructure, Kafka, Data, Retail]
+image: assets/blog-images/real_time_retail/RealTimeRetailTitle.jpg
+featured: false
+hidden: false
+teaser: Discover how real-time data architecture can revolutionize retail operations, from customer engagement to dynamic pricing and inventory management.
+toc: true
+---
+
+# Introduction
+
+The retail industry is one of the fastest-growing sectors in terms of data and analytics. According to a recent study from [Fortune Business Insights](https://www.fortunebusinessinsights.com/industry-reports/retail-analytics-market-101273), the global retail analytics market size was valued at USD 7.56 billion in 2023 and is projected to grow to USD 31.08 billion by 2032, exhibiting a CAGR of 17.2%.
+
+Although a lot of money is pumped into analytics, only a few companies such as Walmart and Amazon hold the majority of the market share. The reason is that they are equipped to make many important decisions based on an ever-growing supply of _real-time_ and historical data, while most of their competitors still use very basic tools that are far better at tracking where they’ve *been* than where they should be *going*. During the pandemic, [McKinsey estimates](https://www.mckinsey.com/industries/retail/our-insights/jumpstarting-value-creation-with-data-and-analytics-in-fashion-and-luxury), the 25 top-performing retailers — most of whom are digital leaders — were 83% more profitable than laggards and took home more than 90% of the sector’s gains in market capitalization.
+
+# Why is Real-Time Data Important in Retail?
+
+Retail customers are generally fickle, and their preferences can change in an instant. Each moment is a point in time when a customer interacts with a brand to get what they want immediately and in context. Real-time data allows retailers to respond swiftly to changes in customer behaviour, offering personalized experiences that can significantly enhance customer satisfaction. For example, Amazon’s recommendation engine accounts for an impressive 35% of their total revenue. Amazon has achieved a substantial increase in ROI through customized homepage displays, advanced analysis of browsing and purchase history tied to the customer’s Amazon account, and strategic email marketing campaigns.
+
+![RealTimeRetail1.png](../assets/blog-images/real_time_retail/RealTimeRetail1.png)
+
+By leveraging real-time analytics, companies can optimize inventory management, reduce stockouts, and improve the efficiency of supply chain operations. This not only boosts sales and reduces Customer Acquisition Costs (CAC) but also helps in retaining customer loyalty. A study by IDC highlights that retailers harnessing real-time data analytics can slash their inventory costs by up to 15% and increase their sales by up to 10%. Providing a seamless and personalized shopping experience across channels in real time contributes to increasing Customer Lifetime Value (CLV).
+
+# Maslow’s Hierarchy of Data Management in Retail
+
+![RealTimeRetail2.png](../assets/blog-images/real_time_retail/RealTimeRetail2.png)
+
+Top 5 Foundational Capabilities for Managing Data in Retail Leading Up to Direct Impact on Customer Experience
+
+## 1. Inventory Visibility
+
+Achieving end-to-end supply chain visibility involves having real-time insights into the entire journey of products. According to CSCMP's [2023 State of Logistics Report](https://cscmp.org/CSCMP/Resources/Reports_and_Surveys/State_of_Logistics_Report/CSCMP/Educate/State_of_Logistics_Report.aspx?hkey=bdfd8da6-e34f-434c-b39c-d3219dd4a6a2), in 2022, the U.S. Business Logistics Cost went up as much as 19.6%, the largest increase to date, of which 52% was due to an increase in inventory carrying costs. In this economic climate, stocking shelves inadequately, letting down consumers, and misjudging demand can lead to catastrophic consequences.
+
+Inventory visibility is crucial for managing stock levels effectively and ensuring that products are available when customers need them. Logging orders and sales data through various tools allows retailers to react more quickly based on the data. This leads to optimized inventory levels, improved customer satisfaction, and reduced costs associated with excess inventory, stockouts, and rush shipments.
+
+## 2. Omnichannel Integration
+
+Shoppers now have more flexibility and can choose different formats for both shopping and fulfilment. Some shoppers tend to go online in search of information, and then actually conclude the transaction in a physical store. Others might go to a store or a showroom to do research, and then they go online to make their purchase. Retailers today have to understand and enable this flexibility and seamless movement across channels.
+
+Therefore, seamless data integration across multiple operational channels, such as online stores, physical shops, and mobile apps, is paramount. By synchronizing data across all channels, retailers can provide a consistent and unified shopping experience for customers. This integration helps in managing inventory, customer preferences, and sales data more effectively, leading to better customer experiences.
+
+## 3. Dynamic Pricing
+
+Dynamic pricing makes sense for staying relevant in retail, where competition is ever-growing, because it allows retailers to adjust prices in real time based on a variety of factors such as customer buying behaviour, seasonal trends, competitor pricing, and product performance. This flexibility ensures that prices remain competitive and appealing to customers, helping to maximize sales and profitability. In a market where consumer preferences and competitive landscapes can change rapidly, dynamic pricing helps retailers react swiftly to these changes, offering the right price at the right time to attract and retain customers.
+
+## 4. Fulfillment Speed
+
+Fulfillment speed is a critical factor in customer satisfaction. Retailers can improve fulfillment speed through dynamic routing based on weather, traffic conditions, and automation. Real-time tracking of orders ensures that customers are informed about the status of their purchases, leading to a better shopping experience. Faster fulfillment also reduces the likelihood of abandoned carts and enhances customer loyalty.
+
+Accurately forecasting demand and supply enables retailers to manage inventory more efficiently, ensuring that popular products are stocked and ready for quick dispatch. Effective demand and supply forecasting also allows for better workforce planning and allocation of resources, leading to faster order processing and quicker delivery times.
+
+## 5. Deep Personalization
+
+Deep personalization is the ultimate goal in retail because it directly impacts customer satisfaction, loyalty, and ultimately, the bottom line. By providing highly personalized experiences, retailers can create deeper emotional connections with their customers, making them feel valued and understood.
+
+The ability to dynamically adjust content, offers, and recommendations based on real-time data ensures that customers receive the most relevant information at any given moment, which is critical for maintaining customer engagement. Utilizing sophisticated algorithms and machine learning models helps in accurately predicting customer preferences and behaviors, leading to more effective personalization strategies.
+
+# Barebone Streaming Architecture in Retail
+
+![RealTimeRetail3.png](../assets/blog-images/real_time_retail/RealTimeRetail3.png)
+
+In today's fast-paced retail environment, leveraging real-time data architecture is crucial for staying ahead of the competition. Here are the key components of real-time data architecture in retail:
+
+## Data Sources
+
+The first step in building a real-time data architecture is identifying and integrating various data sources. These can include point-of-sale (POS) systems, e-commerce platforms (clickstream, etc.), social media feeds, CDPs, and so on. The capability to collect data across both physical and digital stores is an important requirement.
+
+## Event Bus
+
+A centralized event bus is essential for managing the flow of data between different systems and components. It serves as a backbone for real-time data pipelines, ensuring efficient data transfer and integration.
+
+**Apache Kafka** is an ideal choice for this purpose. It allows for high-throughput, low-latency data streaming and can handle large volumes of events in real time. This capability is particularly critical for retailers who need to quickly respond to changes in customer behaviour, manage inventory in real time, and provide dynamic pricing updates. Kafka also supports a wide range of connectors for external systems, enabling effective data ingress and egress.
+
+## Stream Processing Layer
+
+Real-time data processing is crucial for extracting actionable insights. Technologies like Apache Flink, Kafka Streams, and Storm enable continuous data processing. Ingested data can be transformed, filtered, and aggregated on the fly for downstream processing services. Data can be enriched with historical information through joins, which might be relevant for further processing.
+
+Additionally, real-time predictions can be captured for the processed data points and stored in a database for retrieval. Real-time predictions allow for instant decision-making based on the most recent data, realizing the maximum value from that data.
+
+## Decision-making Microservices
+
+Intelligent decision-making microservices are the brain of the real-time data architecture. These microservices use the processed data to make real-time decisions that can significantly impact retail operations. For example, a pricing microservice can dynamically adjust product prices based on current demand, competitor pricing, and inventory levels. Similarly, a recommendation microservice can provide personalized product suggestions to customers based on their browsing and purchase history.
+
+A decision-making microservice can range from a simple rule-based service to a deep learning model. These microservices are designed to learn and improve over time, becoming increasingly intelligent and effective through the use of real-time features. A maturity lifecycle for a microservice can include the following steps:
+
+- CEP/Rule-driven
+- Supervised Learning models
+- Ensemble models
+- Deep Learning models
+
+## Data Analytics and Visualization
+
+The final component is data analytics and visualization, which provides a centralized dashboard for analyzing customer behavior in real time. This dashboard allows retailers to monitor key performance indicators (KPIs), track trends, and gain insights into customer preferences and market dynamics. Retailers can also use these tools to track customer engagement across various channels, allowing them to tailor marketing strategies and improve customer experiences. Real-time visualization tools help in making data-driven decisions quickly and effectively.
+
+# Example implementation
+
+Let’s see an example of how to implement the real-time data architecture described above. Here is the [link](https://github.com/Platformatory/real-time-retail) to the GitHub repository.
+
+## Objective:
+
+Based on Shopify inventory changes and clickstream data, dynamically predict the price and the sell-through date for the available products in real time.
+ +## Prerequisites: + +- Shopify Store +- Docker +- ngrok + +## Details: + +- Kafka Broker will be used as the Event bus for the inventory and clickstream data +- Kafka connect will be used to source the data from the data sources and get the predictions from the microservices +- KSQL will be used to perform ETL on the source data for downstream predictions +- Flask APIs will be used to expose endpoints to talk to prediction microservices + +## Setup the infrastructure: + +1. We will run all the required components as docker containers locally. Following is the `docker-compose.yaml` for the required services. + +```yaml +--- +version: "2" +services: + broker: + image: confluentinc/cp-kafka:7.5.0 + hostname: broker + container_name: broker + ports: + - "9092:9092" + - "9101:9101" + environment: + KAFKA_NODE_ID: 1 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT" + KAFKA_ADVERTISED_LISTENERS: "PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092" + #KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter + #KAFKA_CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: 'broker:9092' + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9101 + KAFKA_JMX_HOSTNAME: localhost + KAFKA_PROCESS_ROLES: "broker,controller" + KAFKA_CONTROLLER_QUORUM_VOTERS: "1@broker:29093" + KAFKA_LISTENERS: "PLAINTEXT://broker:29092,CONTROLLER://broker:29093,PLAINTEXT_HOST://0.0.0.0:9092" + KAFKA_INTER_BROKER_LISTENER_NAME: "PLAINTEXT" + KAFKA_CONTROLLER_LISTENER_NAMES: "CONTROLLER" + KAFKA_LOG_DIRS: "/tmp/kraft-combined-logs" + # Replace CLUSTER_ID with a unique base64 UUID using "bin/kafka-storage.sh random-uuid" + # See https://docs.confluent.io/kafka/operations-tools/kafka-tools.html#kafka-storage-sh + CLUSTER_ID: "41PmKs1mQiGJK0U_Ul45OA" + + connect: + build: + context: . 
+ dockerfile: Dockerfile + hostname: connect + container_name: connect + ports: + - 8083:8083 + - 8000:8000 + depends_on: + - broker + - api + environment: + CONNECT_BOOTSTRAP_SERVERS: "broker:29092" + CONNECT_REST_ADVERTISED_HOST_NAME: connect + CONNECT_GROUP_ID: compose-connect-group + CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs + CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 + CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets + CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status + CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter + CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter + # CLASSPATH required due to CC-2422 + CLASSPATH: /usr/share/java/monitoring-interceptors/monitoring-interceptors-7.5.0.jar + CONNECT_PRODUCER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringProducerInterceptor" + CONNECT_CONSUMER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringConsumerInterceptor" + CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components" + CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR + volumes: + - ./retail_clickstream_schema.avro:/home/appuser/retail_clickstream_schema.avro + + ksqldb-server: + image: confluentinc/cp-ksqldb-server:7.5.0 + hostname: ksqldb-server + container_name: ksqldb-server + depends_on: + - broker + - connect + ports: + - "8088:8088" + environment: + KSQL_CONFIG_DIR: "/etc/ksql" + KSQL_BOOTSTRAP_SERVERS: "broker:29092" + KSQL_HOST_NAME: ksqldb-server + KSQL_LISTENERS: "http://0.0.0.0:8088" + KSQL_CACHE_MAX_BYTES_BUFFERING: 0 + KSQL_PRODUCER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringProducerInterceptor" + KSQL_CONSUMER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringConsumerInterceptor" + KSQL_KSQL_CONNECT_URL: "http://connect:8083" + KSQL_KSQL_LOGGING_PROCESSING_TOPIC_REPLICATION_FACTOR: 1 + KSQL_KSQL_LOGGING_PROCESSING_TOPIC_AUTO_CREATE: "true" + KSQL_KSQL_LOGGING_PROCESSING_STREAM_AUTO_CREATE: "true" + # KSQL_KSQL_QUERIES_FILE: /home/appuser/ksql_queries.sql + volumes: + - ./ksql_queries.sql:/home/appuser/ksql_queries.sql + + ksqldb-cli: + image: confluentinc/cp-ksqldb-cli:7.5.0 + container_name: ksqldb-cli + depends_on: + - broker + - connect + - ksqldb-server + entrypoint: /bin/sh + tty: true + + api: + build: + context: ./api + dockerfile: Dockerfile + hostname: api + container_name: api + ports: + - 5000:5000 +``` + +1. Create a `ShopifyWebhookConnector` to fetch the inventory changes from Shopify. Since the setup is local, we need to expose the endpoint publicly using `ngrok` so that Shopify can send data on updates in the Store. 
+ +```bash +export NGROK_PUBLIC_URL=`curl -s localhost:4040/api/tunnels | jq -r '.tunnels[0].public_url'` +``` + +```json +{ + "name": "$CONNECTOR_NAME", + "config": { + "connector.class": "com.platformatory.kafka.connect.ShopifyWebhookConnector", + "tasks.max": 1, + "topic.default": "webhook", + "topic.header": "X-Shopify-Topic", + "topic.prefix": "plf_", + "key.json.path": "$.id", + "schema.infer": false, + "validator.class": "com.platformatory.kafka.connect.ShopifyRequestValidator", + "port": 8000, + "shopify.access.token": "$SHOPIFY_ACCESS_TOKEN", + "shopify.webhook.create": true, + "shopify.store.name": "$SHOPIFY_STORE_NAME", + "shopify.webhook.topics": "products/update", + "shopify.apisecret": "$SHOPIFY_API_SECRET", + "shopify.connector.hostname": "$NGROK_PUBLIC_URL" + } +} +``` + +1. We will use mock data for the clickstream data. This mock data will be generated using the `DatagenConnector` with a pre-defined schema type. + +```bash +{ + "name": "$CONNECTOR_NAME", + "config": { + "connector.class": "io.confluent.kafka.connect.datagen.DatagenConnector", + "tasks.max": "1", + "kafka.topic": "shopify_clickstream", + "schema.filename": "/home/appuser/retail_clickstream_schema.avro", + "schema.keyfield": "activity", + "topic.creation.default.partitions": 6, + "topic.creation.default.replication.factor": 1, + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter.schemas.enable": "false" + } +} +``` + +1. Load both the shopify inventory data and clickstream data as STREAMS in KSQL + +```sql +CREATE OR REPLACE STREAM PRODUCTS_UPDATE_STREAM + (PAYLOAD STRUCT< + id BIGINT, + created_at STRING, + inventory_management STRING, + title STRING, + vendor STRING, + variants Array< + Struct< + id BIGINT, + created_at STRING, + title STRING, + inventory_item_id BIGINT, + inventory_quantity BIGINT, + old_inventory_quantity BIGINT, + sku STRING, + price STRING, + product_id BIGINT + > + > + > + ) WITH (KAFKA_TOPIC='plf_products_update', KEY_FORMAT='KAFKA', VALUE_FORMAT='JSON'); + +CREATE OR REPLACE STREAM CLICKSTREAM_STREAM ( + time STRING, + user_id BIGINT, + product_variant_id BIGINT, + activity STRING, + ip STRING +) WITH (KAFKA_TOPIC='shopify_clickstream', KEY_FORMAT='KAFKA', VALUE_FORMAT='JSON'); +``` + +1. Create a transformed inventory stream called `PRODUCTS_UPDATES` with only with the required information + +```sql +CREATE OR REPLACE STREAM PRODUCTS_UPDATES + WITH (KAFKA_TOPIC='ksql_product_updates', PARTITIONS=6, REPLICAS=1) AS SELECT + PAYLOAD->id id, + PAYLOAD->created_at created_at, + PAYLOAD->inventory_management inventory_management, + PAYLOAD->title title, + PAYLOAD->vendor vendor, + EXPLODE(PAYLOAD->variants) variant + FROM PRODUCTS_UPDATE_STREAM + EMIT CHANGES; +``` + +1. For every product variant in clickstream, aggregate the number of product views and create a `KTable` + +```sql +CREATE OR REPLACE TABLE CLICKSTREAM_ACTIVITY +WITH (KAFKA_TOPIC='ksql_clickstream_activity', PARTITIONS=6, REPLICAS=1, KEY_FORMAT='JSON') +AS +SELECT + product_variant_id product_variant_id, + COUNT(activity) activity_count +FROM CLICKSTREAM_STREAM +WHERE activity='contentView' +GROUP BY product_variant_id +EMIT CHANGES; +``` + +1. 
Join the aggregated clickstream data and inventory data for each product variant and write to a output topic + +```sql +CREATE OR REPLACE STREAM PRODUCT_CLICKSTREAM +WITH (KAFKA_TOPIC='ksql_product_clickstream', PARTITIONS=6, REPLICAS=1) +AS +SELECT p.id product_id, + p.created_at product_created_at, + p.inventory_management inventory_management, + p.title product_title, + p.vendor product_vendor, + p.variant->created_at variant_created_at, + p.variant->inventory_item_id variant_inventory_item_id, + p.variant->old_inventory_quantity old_inventory_quantity, + p.variant->inventory_quantity inventory_quantity, + p.variant->sku variant_sku, + p.variant->price price, + c.product_variant_id variant_id, + c.activity_count activity_count +FROM PRODUCTS_UPDATES p INNER JOIN CLICKSTREAM_ACTIVITY c ON ((p.variant->id = c.product_variant_id)) EMIT CHANGES; +``` + +1. Sample microservices which returns predictions for price and sell through date will be exposed via Flask APIs. Microservices are just dummy functions for this example implementation. + +```python +from flask import Flask,request,jsonify +from datetime import timedelta,datetime,date +from random import randrange +import ast + +app = Flask(__name__) + +@app.route('/') +def index(): + return 'Index Page' + +@app.route('/user', methods=["POST"]) +def get_user(): + data = request.get_json() + username = data['username'] + password = data['password'] + print({username, password }) + return {"username":username, password: password} + +def predict_sell_through(inventory_data): + average_daily_sales = inventory_data['total_sales'] / inventory_data['total_days'] + days_until_sell_through = inventory_data['quantity_on_hand'] / average_daily_sales + sell_through_date = datetime.now() + timedelta(days=days_until_sell_through) + inventory_doh = inventory_data['quantity_on_hand'] / average_daily_sales + return sell_through_date, inventory_doh + +@app.route('/stores//inventory_forecast/predict', methods=["POST"]) +def show_post(store_name): + try: + inventory_data = request.json + if not inventory_data: + return jsonify({'error': 'No data provided'}), 400 + + + sell_through_date, inventory_doh = predict_sell_through(inventory_data) + print(sell_through_date, inventory_doh, inventory_data) + return jsonify({ + 'sell_through_date': sell_through_date.strftime('%Y-%m-%d'), + 'inventory_doh': int(inventory_doh) + }) + except Exception as e: + return jsonify({"error": str(e)}) + +@app.route('/dynamic-pricing', methods=['POST']) +def dynamic_pricing(): + try: + data = request.get_json()[0] + print(data) + if data["ACTIVITY_COUNT"]>3: + data["dynamic_price"] = float(data["PRICE"])*1.1 + else: + data["dynamic_price"] = float(data["PRICE"])*1.01 + + return data, 200 + + except Exception as e: + # Handle any exceptions + print(f"Error processing request: {str(e)}") + return jsonify({'error': 'An error occurred'}), 500 + +@app.route('/sell-through', methods=['POST']) +def sell_through_date(): + try: + data = request.get_json()[0] + print(data) + # date + random days (5-10) + data["sell_through_date"] = datetime.now() + timedelta(days=randrange(10)) + + return data, 200 + + except Exception as e: + # Handle any exceptions + print(f"Error processing request: {str(e)}") + return jsonify({'error': 'An error occurred'}), 500 + +if __name__ == "__main__": + app.run(host='0.0.0.0', port=5000) + +# curl -X POST http://localhost:5000/stores/acme/inventory_forecast/predict -H 'Content-Type:application/json' -d 
'{"total_sales":453,"total_days":32,"quantity_on_hand":98,"current_date":"2024-04-10T15:17"}' +``` + +1. Create 2 `HttpSinkConnector` which will get the price and sell through predictions for each enriched data from the KSQL output topic and write it to respective success topics. + +```json +## Dynamic pricing +{ + "name": "$CONNECTOR_NAME", + "config": { + "connector.class": "io.confluent.connect.http.HttpSinkConnector", + "tasks.max":1, + "http.api.url":"http://api:5000/dynamic-pricing", + "headers":"Content-Type:application/json|Accept:application/json", + "request.body.format": "json", + "reporter.result.topic.replication.factor":1, + "reporter.error.topic.replication.factor":1, + "reporter.bootstrap.servers": "broker:29092", + "confluent.topic.bootstrap.servers": "broker:29092", + "confluent.topic.replication.factor": "1", + "topics": "ksql_product_clickstream", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter.schemas.enable": "false" + } +} + +## Sell through date +{ + "name": "$CONNECTOR_NAME", + "config": { + "connector.class": "io.confluent.connect.http.HttpSinkConnector", + "tasks.max":1, + "http.api.url":"http://api:5000/sell-through", + "headers":"Content-Type:application/json|Accept:application/json", + "request.body.format": "json", + "reporter.result.topic.replication.factor":1, + "reporter.error.topic.replication.factor":1, + "reporter.bootstrap.servers": "broker:29092", + "confluent.topic.bootstrap.servers": "broker:29092", + "confluent.topic.replication.factor": "1", + "topics": "ksql_product_clickstream", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter.schemas.enable": "false" + } +} +``` + +In this example, we were able to demonstrate a sample real time data pipeline for a retail use case where we are predicting price and sell through date based on product inventory and clickstream activity. + +# Conclusion + +The implementation of a real-time data architecture in retail is essential for gaining a competitive edge in today's fast-paced market. By leveraging real-time data, retailers can enhance customer satisfaction through dynamic routing, accurate demand forecasting, and deep personalization. As the retail landscape continues to evolve, investing in real-time data capabilities will be crucial for optimizing operations, improving customer experiences, and driving business growth. diff --git a/_posts/2024-07-24-Use-OAuth-OIDC-to-Authenticate-to-Confluent-Cloud.md b/_posts/2024-07-24-Use-OAuth-OIDC-to-Authenticate-to-Confluent-Cloud.md new file mode 100644 index 0000000000..894eefc68e --- /dev/null +++ b/_posts/2024-07-24-Use-OAuth-OIDC-to-Authenticate-to-Confluent-Cloud.md @@ -0,0 +1,280 @@ +--- +layout: post +title: "Seamless Authentication to Confluent Cloud Using OAuth/OIDC with Okta as Identity Provider" +categories: [Kafka Architecture, Distributed Systems, Security] +teaser: Want to streamline your Kafka authentication? Learn how to effortlessly integrate Confluent Cloud with OAuth/OIDC using Okta as your identity provider. This guide covers everything you need to set up secure and seamless authentication, ensuring smooth access to your Kafka resources. Discover the power of modern authentication and simplify your Kafka management today! 
+author: Shivaprakash +image: assets/blog-images/oauth-oidc-blog/Oauth.jpeg +toc: true +--- + +# **OAuth2: Essential Insights You Should Know** + +In the digital age, securing access to sensitive data and resources is paramount. Traditional authentication methods often require users to share their long-term credentials with third-party applications, posing significant security risks. Enter OAuth 2.0, a powerful authorization framework designed to address these concerns by allowing users to grant third-party websites or applications access to their protected resources without revealing their long-term credentials or identity. + +OAuth 2.0 introduces an innovative authorization layer that separates the role of the client (the application requesting access) from the resource owner (the user). In this framework, the client requests access to resources controlled by the resource owner and hosted by the resource server. Instead of using the resource owner's credentials, the client is issued an Access Token— a string that defines the specific scope, lifetime, and other access attributes. This token is issued by an authorization server with the user's approval, enabling the client to access the protected resources on the resource server securely. + +Platforms like Auth0 further enhance this process by generating access tokens in JSON Web Token (JWT) format for API authorization scenarios. The permissions associated with these access tokens, known as scopes, define the level of access granted to the application. When an application authenticates with Auth0, it specifies the desired scopes. If these scopes are authorized by the user, the access token will represent these authorized permissions, ensuring secure and granular access control. + +In this blog post, we will discuss how configuring identity providers on Confluent Cloud allows you to manage application access without depending on user accounts. This section covers various aspects of configuring and managing identity providers for OAuth/OIDC. + +# **Pros and Cons of Using OAuth** + +## **Pros** + +**Security:** +- Reduces the risk associated with sharing long-term credentials. +- Limits the access granted to third-party applications, minimizing potential damage from compromised tokens. + +**Granular Access Control:** +- Users can grant specific permissions (scopes) to third-party applications. +- Tokens can have limited lifetimes and scopes, offering fine-grained control over access. + +**User Experience:** +- Simplifies the process of granting access to third-party applications. +- Users authenticate with a trusted authorization server, improving confidence in the security of their credentials. + +## **Cons** + +**Complexity:** +- Setting up OAuth can be complex, especially if you’re not familiar with it. +- The configuration involves multiple components such as authorization servers, scopes, tokens, and client credentials, which can be challenging to manage without prior experience. + +**Dependency:** +- OAuth relies on an identity provider for authentication. +- This dependency introduces additional points of failure. If the identity provider experiences downtime or issues, it can affect the entire authentication flow. This identity provider could be any service, not necessarily Okta, and the reliance on this external service adds a layer of dependency that needs to be managed carefully. + +## **When to Use OAuth** +- When you require secure, token-based authentication to manage access to resources. 
+- If you are using Confluent Cloud in an enterprise setting where centralized identity management is important, and you need to ensure that access control is streamlined and consistent across multiple systems and applications. +- When integrating with systems that support OAuth for seamless interoperability and access control. + +# **Let's Dive into Configuring Okta!** + +## **Step 1 : Application integration in Okta** + +By setting up an application in Okta, you configure the authentication methods, such as OpenID Connect (OIDC), that will be used to secure the connection to your Confluent Cloud cluster. This ensures that only authorized clients can access the resources.The setup process in Okta generates a Client ID and Client Secret, which are essential for authenticating your application with Confluent Cloud. These credentials are used to request access tokens, which are then used to access Confluent Cloud resources securely. + +![Image1](../assets/blog-images/oauth-oidc-blog/application.png) + + **How to Set Up Application Integration in Okta?** + +1. Log in to Okta Admin Console Access your Okta Admin Console at https://{yourOktaDomain}/admin. +2. Navigate to Applications: From the left-hand menu, click on "Applications" and then select "Applications" again. +3. Create a New Application Integration: Click on "Create App Integration." +4. Choose Application Type: Select "**OIDC - OpenID Connect**" for applications using OAuth 2.0 or OpenID Connect, then click "Next." +5. Configure Application Settings: Enter a descriptive name for your application (e.g., Kafka Client) and choose "Client Credentials" as the sign-on method. +6. Set Additional Options: Ensure "Client Credentials Grant" is selected. Configure or leave "Redirect URIs" and "Logout Redirect URIs" as needed. +7. Complete the Setup: Click "Save" to finalize the configuration. +8. Obtain Client Credentials: After saving, go to the application's "General" tab to find and record the Client ID and Client Secret. + +## **Step 2: Configure the Authorization Server** + + An authorization server is a key component in the OAuth 2.0 and OpenID Connect (OIDC) frameworks. Its primary function is to issue tokens that control access to protected resources and enable user authentication. By providing tokens like access tokens and ID tokens, the authorization server manages and secures access to various services. It enforces access policies to determine which resources can be accessed and under what conditions. Each authorization server has a unique issuer URI and a specific signing key to ensure the security and integrity of the tokens it issues. This setup helps maintain clear security boundaries and enhances overall protection. + +**In Okta, configuring the authorization server allows you to:** + +- **Create Access Policies**: Set rules to define who can access which resources and under what conditions. +- **Create Rules for Each Access Policy**: Specify detailed rules that determine how access policies are enforced. +- **Create Scopes**: Define the permissions that applications can request, specifying the level of access they are granted. +- **Create Claims**: Specify the attributes included in tokens, such as user roles or email addresses, to provide essential information for authorization decisions. + + By configuring these elements, you ensure that tokens are aligned with your security policies and access requirements, allowing for effective control over resource access. 
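Once the authorization server exists, a quick way to confirm its issuer URI and signing keys (the same values Confluent Cloud will ask for later) is to read its OIDC discovery document. The sketch below is a minimal, illustrative Python example; `{yourOktaDomain}` and the `default` authorization server name are placeholders you would replace with your own values.

```python
import requests

# Placeholders: replace with your Okta domain and authorization server ID.
OKTA_DOMAIN = "https://{yourOktaDomain}"   # e.g. https://mycompany.okta.com
AUTH_SERVER = "default"

# Standard OIDC discovery endpoint exposed by an Okta authorization server.
discovery_url = f"{OKTA_DOMAIN}/oauth2/{AUTH_SERVER}/.well-known/openid-configuration"
metadata = requests.get(discovery_url, timeout=10).json()

# These map directly to the Issuer URI, JWKS URI, and token endpoint fields
# referenced later when registering the identity provider in Confluent Cloud.
print("issuer   :", metadata["issuer"])
print("jwks_uri :", metadata["jwks_uri"])
print("token    :", metadata["token_endpoint"])

# Fetch the signing keys used to validate the tokens this server issues.
jwks = requests.get(metadata["jwks_uri"], timeout=10).json()
print("key ids  :", [key["kid"] for key in jwks["keys"]])
```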
+ +![Image2](../assets/blog-images/oauth-oidc-blog/authorization.png) + +# **Integrating Okta with Confluent Cloud: A Configuration Walkthrough** + + **Steps to Configure an Okta Identity Provider in Confluent Cloud** + +To configure Confluent Cloud we have to first register the IdP. +Go to Accounts & access from the menu in the top right, and select the Identity providers tab. + +![Image3](../assets/blog-images/oauth-oidc-blog/idp.png) + +Select the **Add Identity Providers** button and choose Okta, then press **Next**. + +![Image4](../assets/blog-images/oauth-oidc-blog/okta.png) + +In next screen enter the details for the okta : + +- **Name**: Enter a meaningful name for your Okta identity provider. +- **Description**: Provide relevant information for managing this provider. +- **Authorization Server**: Enter the identifier for your Okta authorization server. The default is "default," but this can be modified if needed. +- **Domain**: Enter the domain for your Okta instance. The default is {yourDomain}.okta.com. Replace {yourDomain} with your actual Okta domain name. +- **OIDC Discovery URL**: Click to import metadata required for configuring your OIDC provider. This action will populate the JWKS URI and Issuer URI fields automatically. +- **JWKS URI**: Enter the URI for the JSON Web Key Set (JWKS). +- Example: https://mycompany.okta.com/oauth2/default/v1/keys +- **Issuer URI**: Enter the issuer URI for your Okta identity provider. +- Example: https://mycompany.okta.com/oauth2/default + +![Image5](../assets/blog-images/oauth-oidc-blog/idp2.png) + +This creates the provider, but we still have to configure identity pools. Do this by clicking the identity provider link.From this screen, we can create a new identity pool by pressing the **Add pool** button. + +![Image7](../assets/blog-images/oauth-oidc-blog/pool.png) + +**Configure your identity pool and access policy.** + +- An *identity pool* is a group of external identities that are assigned a certain level of access based on policy. \ +For details, see [Use Identity Pools with Your OAuth/OIDC Identity Provider on Confluent Cloud](https://docs.confluent.io/cloud/current/security/authenticate/workload-identities/identity-providers/oauth/identity-pools.html#add-oauth-identity-pools). + +- In the example below, we are assigning the Cluster Admin role to the identity pool pool-mPqE. This means that users accessing this cluster through the pool will have full Admin permissions and will be able to create, delete, and edit any resources. + +![Image8](../assets/blog-images/oauth-oidc-blog/configurerole.png) + +# **Configuring Your Kafka Client** + +## **Set Client ID and Client Secret** + +Use the Client ID and Client Secret from Okta to configure your Kafka client. + +## **Client Configuration in Kafka** + +Update your Kafka client configuration to include these settings: \ +properties + +``` +bootstrap.servers= +security.protocol=SASL_SSL +sasl.mechanism=OAUTHBEARER +sasl.oauthbearer.token.endpoint.url=https://{yourOktaDomain}/oauth2/default/v1/token +sasl.jaas.config= \ +org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required \ +clientId='' \ +clientSecret='' \ +scope='openid' \ + extension_logicalCluster='' \ + extension_identityPoolId=''; + +``` + +## **Request a JSON Web Token (JWT)** + +**Authenticate using Client Credentials:** + +Use the Client ID and Client Secret to request a JWT from Okta via the OAuth 2.0 Client Credentials Grant.The response will contain an access_token which is a JWT. 
+ +``` +curl --request POST \ + --url https://{yourOktaDomain}/oauth2/default/v1/token \ + --header 'Content-Type: application/x-www-form-urlencoded' \ + --data 'grant_type=client_credentials&client_id=&client_secret=&scope=openid' + +``` +Example: + +![Image9](../assets/blog-images/oauth-oidc-blog/cctoken.png) + + +# **Role-Based Testing** + +- **Cluster Administrator** +Set up the Cluster Admin role for the cluster in Confluent Cloud. With this role, I am able to create and delete topics. + +![Image10](../assets/blog-images/oauth-oidc-blog/ClusterAdmin.png) + +- **Operator** +Set up the Operator role. This role allows you to view and describe topics but does not grant permissions to create or delete them. + +![Image11](../assets/blog-images/oauth-oidc-blog/Operatorrole.png) + +# **Breaking Down the OAuth 2.0 Flow** + +![Image12](../assets/blog-images/oauth-oidc-blog/oauth.png) + +Confluent OAuth uses the OAuth 2.0 protocol for authentication and authorization. OAuth is a widely-used standard that provides temporary access tokens to clients. These tokens allow clients to access and use Confluent Cloud resources and data on behalf of a user or application. + +## **Establish Trust Between Confluent and Your Identity Provider** + +**Add the Identity Provider:** + +- Ensure that the identity provider (e.g., Okta) is registered with Confluent Cloud. This usually involves providing some configuration details to Confluent Cloud. + +**Define the Type of Identity Provider:** + +- Specify whether the identity provider is OAuth 2.0 or OIDC (OpenID Connect). Each provider might have specific requirements for integration. + +**Create a Trust Relationship:** + +- This involves configuring Confluent Cloud to trust tokens issued by your identity provider. You might need to upload or specify the public keys or JWKS (JSON Web Key Set) URL from your identity provider. + +**Add Claims for Authentication and Authorization:** + +- Define which claims from the JWT will be used for authentication and authorization. Common claims include sub (subject), aud (audience), and custom claims like user roles or groups. + +## **Configure Your Identity Pool and Access Policy** + +**Identity Pool:** + +- Create an identity pool in Confluent Cloud, which groups external identities and assigns them access based on policies. You might need to configure mappings to ensure the correct access levels. + +**Access Policy:** + +- Define what resources the identities in the pool can access and what actions they can perform. + +## **Configure Clients** + +**Client ID and Client Secret:** + +- Obtain these from your identity provider. They are used to authenticate the client (Kafka producer/consumer) with the identity provider. + +**Client Credentials Grant:** + +- The client uses the Client ID and Client Secret to request an access token (JWT) from the identity provider. 
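As a scripted alternative to the curl command shown earlier, a minimal Python sketch of the same client credentials request might look like the following; the domain, client ID, and client secret are placeholders, and the `requests` library is assumed to be installed.

```python
import requests

# Placeholders: substitute your Okta domain and the application's credentials.
TOKEN_URL = "https://{yourOktaDomain}/oauth2/default/v1/token"
CLIENT_ID = "<client-id>"
CLIENT_SECRET = "<client-secret>"

response = requests.post(
    TOKEN_URL,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    data={
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": "openid",
    },
    timeout=10,
)
response.raise_for_status()

# The access_token is a JWT issued by the Okta authorization server.
token = response.json()["access_token"]
print(token[:40], "...")
```

This is the same token that the Kafka client's OAUTHBEARER login module obtains on your behalf at runtime using the configured token endpoint.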
+ +**Producer/Consumer Configuration Example:** + +Use the following Kafka client settings for OAuth 2.0 authentication: + +``` +log4j.logger.org.apache.kafka=TRACE +bootstrap.servers=pkc-p11xm.us-east-1.aws.confluent.cloud:9092 +security.protocol=SASL_SSL +sasl.oauthbearer.token.endpoint.url=https://trial-6662742.okta.com/oauth2/aush040p2xDZzZHcu697/v1/token +sasl.login.callback.handler.class=org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler +sasl.mechanism=OAUTHBEARER +sasl.jaas.config= \ + org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required \ + clientId='0oah03x6a2AjiQJZl697' \ + scope='api_scope' \ + clientSecret='QSG_xMfvaQ-Vw_i_3DzsfaHYg6K551p-Mfo7vNyMEcLDuY8E35ZUWIIivR4ZwgoQ' \ + extension_logicalCluster='lkc-37vn0o' \ + extension_identityPoolId='pool-mPqE'; +``` + \ +## **Validate the Token** + +**Confluent Cloud Token Validation:** + +Confluent Cloud validates the JWT received from the Kafka client. It checks the token against the trusted JWKS and verifies the claims to map to the appropriate authorization policy. + +**JSON Web Token (JWT) Example:** + +``` +{ + "ver": 1, + "jti": "AT.8p0wrTPUIm8yBHqgiKDrnJQ_32_MNc_FlkuqOjq8VCQ", + "iss": "https://trial-6662742.okta.com/oauth2/aush040p2xDZzZHcu697", + "aud": "confluent", + "iat": 1721739387, + "exp": 1721742987, + "cid": "0oah03x6a2AjiQJZl697", + "scp": [ + "api_scope" + ], + "sub": "0oah03x6a2AjiQJZl697" +} +``` +The JWT includes claims such as sub for the user ID, aud for the audience, and groups for any group memberships, which Confluent Cloud uses to determine access rights. + +# **Conclusion** + +OAuth 2.0 and Okta together offer a robust framework for managing secure and efficient access to your resources. By integrating Okta as your identity provider, you simplify the authentication process and enhance security, allowing you to control access through OAuth tokens and scopes. Configuring Okta with Confluent Cloud and your Kafka client ensures that your systems are protected from unauthorized access while maintaining smooth and manageable workflows. Embracing this setup not only strengthens your security measures but also optimizes the overall efficiency of your access management, providing a seamless experience for both administrators and users. 
+ +# References + +- [https://developer.okta.com/docs/guides/authorization/](https://developer.okta.com/docs/guides/authorization/) +- [https://help.okta.com/en-us/content/topics/security/api_access.htm](https://help.okta.com/en-us/content/topics/security/api_access.htm) +- [https://confluent/idp](https://www.confluent.io/blog/configuring-azure-ad-ds-with-oauth-for-confluent/?_ga=2.45404768.551716354.1721102650-2109976434.1715347071&_gac=1.153683402.1721276426.CjwKCAjw1920BhA3EiwAJT3lSYsLUrc8CdJ17pl0KiBPfRTXVwM5AX8-ts-2V4-f-s1NnVp94tJETBoCE5AQAvD_BwE&_gl=1*rgtxzr*_gcl_aw*R0NMLjE3MjEyNzY0MjcuQ2p3S0NBancxOTIwQmhBM0Vpd0FKVDNsU1lzTFVyYzhDZEoxN3BsMEtpQlBmUlRYVndNNUFYOC10cy0yVjQtZi1zMU5uVnA5NHRKRVRCb0NFNUFRQXZEX0J3RQ..*_gcl_au*MjExOTgwMjgxNi4xNzE1MzQ3MDcx*_ga*MjEwOTk3NjQzNC4xNzE1MzQ3MDcx*_ga_D2D3EGKSGD*MTcyMTI5NDI4Mi4yNi4xLjE3MjEyOTgyMzUuMzcuMC4w#setting-up-azure-ad-as-an-idp) +- [https://docs.confluent.io/cloud/current/security/authenticate/workload-identities/identity-providers/oauth/overview.html#oauth-overview](https://docs.confluent.io/cloud/current/security/authenticate/workload-identities/identity-providers/oauth/overview.html#oauth-overview) diff --git a/_posts/2024-07-29-Tooling-and-Process-Lessons-from-running-Kafka-at-scale-for-mission-critical-customers.md b/_posts/2024-07-29-Tooling-and-Process-Lessons-from-running-Kafka-at-scale-for-mission-critical-customers.md new file mode 100644 index 0000000000..0abadf403a --- /dev/null +++ b/_posts/2024-07-29-Tooling-and-Process-Lessons-from-running-Kafka-at-scale-for-mission-critical-customers.md @@ -0,0 +1,253 @@ +--- +layout: post +title: "Tooling and Process: Lessons from running Kafka at scale for mission critical customers" +categories: [Apache Kafka, Mission-Critical Applications, High Availability Systems, Cluster Management, Performance Optimization, Operational Challenges, Kafka at Scale, SRE Practices, ] +teaser: Dive into our journey of deploying Apache Kafka at scale for fintech clients, navigating challenges and implementing solutions for mission-critical success +authors: Arun +featured: false +hidden: false +image: assets/blog-images/running-kafka-at-scale/kafka.png +toc: true +--- + +# **Introduction** + +In today’s digital economy, data is the lifeblood of business operations. Companies rely on real-time data processing to make informed decisions, optimize operations, and enhance customer experiences. Apache Kafka has emerged as a critical component in this ecosystem, enabling the efficient handling of vast amounts of data. However, running Kafka at scale for mission-critical customers presents unique challenges and opportunities. + +In our experience, we managed clients in the fintech domain with Kafka clusters on-premises, spread across more than 30 servers, in a highly controlled environment with stringent security requirements. Access to these systems was restricted to screen sharing, adding complexity to operations. + +This article explores the challenges faced, steps followed to mitigate them, and lessons learned from deploying and managing Kafka clusters at scale, focusing on tooling, processes, and best practices to ensure reliability, performance, and customer satisfaction. + + +# **Understanding the Kafka Ecosystem** + +Before diving into our experience, it's crucial to understand the Kafka ecosystem. Apache Kafka is not just a message broker; it’s a distributed streaming platform that consists of several key components: + + + +* **Producers:** Applications that publish messages to Kafka topics. 
+* **Consumers:** Applications that subscribe to topics and process messages. +* **Brokers:** Servers that store and manage data streams. +* **ZooKeeper:** A distributed coordination service that manages Kafka brokers. +* **Connector**: Seamlessly connect data with Kafka. + + +## **General Cluster Architecture:** + +![architecture.png](../assets/blog-images/running-kafka-at-scale/architecture.png) + +**Clusters in Primary Site:** Handling the core operational data processing, ensuring high availability and performance for critical business functions. + +**Clusters in Backup Site:** Acting as a backup to the Primary clusters, ready to take over in case of failures, with real-time data replication ensuring consistency and reliability. + +**Clusters in PreProd:** Focusing on testing, development, and non-functional requirements, allowing for safe experimentation and feature validation without impacting production workloads. + + +# **Challenges Faced:** + + +## **Periodic Broker Failure:** + +One of the first issues we encountered was the periodic failure of a broker within a particular Kafka cluster. The broker logs clearly indicated that the failures were caused by a _"Too Many Files Open"_ error, which implied that the file descriptor limit was being reached, preventing the broker from opening new files and thus leading to failure. It's important to understand that each client connection uses a file descriptor, and each segment file also consumes one. + +The initial hotfix for this issue was to restart the affected server, which temporarily resolved the problem. However, the issue would reoccur in the same cluster after a few days, so it was crucial to identify the root cause of the open file leak. + +We observed that the client was deleting all topics at the end of each day and recreating them for the next day. This process created temporary open files, leading to zombie file descriptors accumulating over time. + +To resolve the issue permanently, we increased the open file descriptor limit to a much larger number than required for the Kafka cluster's workload. This higher limit accommodated the zombie file descriptors until Kafka itself cleaned them up after the retention period. + + +## **A Problem Isn't a Problem Until It Becomes a Business Problem:** + +One of the main observations we had with this customer was that issues were initially not brought to our attention because the customer was unaware of them, as they weren’t yet affecting their production application, although they had the potential to do so. Since we didn’t have direct access to the customer environment and were dependent on the customer to screen share and grant us access, we came up with the idea of conducting periodic health checks of the Kafka clusters in the client environment. In fact, we identified so many potential issues during these checks that the client requested us to perform a health check every day before the start of business hours. + + +### **Major Issues Discovered:** + + +#### **Failed Replicators in Backup Site:** + +Our client had a process of deleting and recreating topics after processing all the messages within them. This approach led to an issue with the Replicator, which was responsible for copying data from the Kafka clusters in the production site to the backup site. + +Due to the client’s reliance on topic whitelists, there was a potential risk of replication failure if these whitelisted topics were missing from the primary site during cleanup activities. 
+ +The client initially overlooked this failed replication because it did not affect their active application. However, it posed a significant risk of data loss if there was ever a need to failover to the backup site. + +This issue was identified and mitigated by advising the client to pause the replicator before initiating the cleanup process and only restart it after all topics were deleted and recreated in the primary site. + + +#### **Connect log not being captured:** + +We encountered an issue where the Connect worker logs were not being written after a few days in the Kafka cluster with the highest usage. This problem was first noticed when one of the connector tasks failed. + +It was observed that the volume of transactions was so high that the logs generated quickly filled up the servers disk space, causing the log-writing task to fail. It's important to note that Replicators use memory for replication instead of disk space allowing them to function as expected. Since Logs are of paramount importance for troubleshooting if any issue happens and this issue was taken on priority by us. + + +Though increasing disk space was a straightforward solution, it wasn’t feasible for all servers. Instead, we implemented a solution using Kafka-CP-LogMonitor, which aggregates logs for Kafka components in one location and backs up older log files in a data warehouse. Once Kafka-CP-Logmonitor was set up a cron job was run on the server to automatically delete older log files, ensuring there was always sufficient disk space available for new logs. + + +#### **Replicator failure due to OOM Exception:** + +We once encountered a peculiar issue where the replicator failed with an Out of Memory (OOM) Exception, despite the fact that no messages were being replicated at the time, and there was plenty of free space available on the server. + +The root cause of this issue was that the JVM memory allocated to the replicator exceeded the available memory on the server. This occurred because the infrastructure team had reduced the server's memory allocation, assuming it would be sufficient. + + +#### **Missing Connect Cluster:** + +Early one morning, we received an urgent call informing us that the entire Connect cluster was missing from one of the Kafka clusters in the Backup Site. This was a critical issue because the absence of the Connect cluster meant that no data replication to the Backup Site was occurring. + +Logs revealed an issue with the embedded Jetty server which was not starting, likely due to a corrupt JAR file. This was perplexing because no changes had been made to the server by us or any other team, as all server modifications were typically performed over the weekend, and this occurred mid-week. + +During troubleshooting, we discovered that a team had performed some Java-related patching a few days earlier. When the server restarted on this particular day, the Connect worker failed to start because the embedded Jetty server couldn't initialize due to the corrupt JAR file. + +Further investigation revealed that this issue also affected other Kafka servers in the Backup Site. 
We resolved the problem by uninstalling OpenJDK and any conflicting packages, followed by reinstalling the correct version of OpenJDK on all the affected servers using the following commands: + + +```bash +# Uninstall OpenJDK +rpm -qa | grep openjdk | xargs yum -y remove yum remove java + +# Remove conflicting packages +yum remove + +# Reinstall correct OpenJDK +rpm -Uvh --force openjdk-rpm/*.rpm +``` + + + +#### **Data Replication Lag:** + +There used to be a huge replication lag (~15 Million) in one of the kafka clusters in Backup SIte during peak business hours. This issue primarily arose due configuration within this cluster, where topics were set to have only one partition, preventing them from utilizing Kafka's parallel processing capabilities. This configuration was necessary to maintain message ordering, as the client was not using message keys. + +We advised the client to increase the partition count of the topics and use an instrument ID from the messages as a key. This approach would ensure that messages with the same instrument ID would be routed to the same partition, thereby maintaining order. However, this change required load testing in the PreProd environment before implementation in the production environment. + +As a temporary hotfix, we performed stepwise optimizations on the replicator configuration, with periodic monitoring of the lag to evaluate the effectiveness of the changes. The following configurations were adjusted: + + +```bash +src.consumer.max.poll.records = incremental changes from 100000 to 800000 +src.consumer.fetch.min.bytes = incremental changes from 100000 to 800000 +producer.override.batch.size = incremental changes from 100000 to 800000 +``` + + +As a result of these optimizations, the replication lag was reduced from approximately 15 million messages to fewer than 100 messages during peak hours within a week, as illustrated in the shared graph. + +![graph.png](../assets/blog-images/running-kafka-at-scale/graph.png) + + +#### **Time consuming Post Kafka consumption Process:** + +One of the teams was experiencing delays in processing messages after consuming them from Kafka, due to time-consuming filtering and join operations before sending the processed data to the database. They were performing join operations by fetching data from a local cache, which significantly slowed down the process. + + \ +A work around to this was suggested, i.e to load data from the local cache into a Kafka topic via a source connector, and perform the join and filtering operations within the Kafka cluster using KSQL. Finally, use Kafka connectors to load the processed data into the database. This solution streamlined the workflow, making it nearly real-time. + + +#### **Ansible Inventory files not found:** + +While setting up the operational tool CP-JMX-Monitor (explained in detail later), one of the requirements was to enable Kafka components to expose JMX metrics in a format readable by Prometheus. + +Since all Kafka clusters were configured using Ansible, we needed to update the Ansible inventory file with the following configuration and perform a rolling update: + + +```bash +all: + vars: + # ... + jmxexporter_enabled: true + jmxexporter_url_remote: false + jmxexporter_jar_url:jmx_prometheus_javaagent-0.12.0.jar +``` + + +Unfortunately, the Ansible inventory file used to set up the Kafka cluster was missing from the client environment. + +To mitigate this issue, we considered the following options: + + + +1. 
Manually Expose JMX Metrics from Each Kafka Component: + * This approach was cumbersome and prone to human error, making it impractical. +2. Rewrite the Ansible Inventory File Manually: + * This option was risky, as any discrepancies in the inventory file could lead to undesired changes in the Kafka cluster. +3. Run [Ansible discovery Scripts](https://github.com/confluentinc/cp-discovery): + * These scripts would check the Kafka cluster and automatically rewrite the inventory file. The only requirement was a host.yml file. + +Given these options, we chose the third option to recreate the lost inventory files using the ansible discovery scripts. + + +# **Custom Tooling for Kafka Management :** + +It is well known that prevention is better than mitigation. Therefore, early detection and resolution of anomalies in a Kafka cluster is crucial. With this in mind, we designed and implemented operational tools for managing, monitoring, and securing Kafka clusters. + +Below are the custom tooling solutions we developed specifically for this customer: + + +## **Kafka-CP-Log-Monitor** + +**"Kafka-cp-log-monitor"** is a specialized tool designed to monitor and analyze Kafka component logs in one place. It leverages the ELK stack (Elasticsearch, Logstash, and Kibana) with Filebeat configured on each component to collect and push logs to Logstash. The logs are then parsed according to predefined configurations and made available in the Kibana UI for users to view and analyze. + +This tool addresses several challenges commonly faced when analyzing Kafka component logs: + + + +* **Access Permissions:** Sufficient permissions were required to SSH into component servers to access logs. "kafka-cp-log-monitor" eliminated this need by aggregating logs in a centralized location. +* **Time Overhead:** Fetching or viewing logs spread across multiple servers can be time-consuming. This tool reduced the overhead by providing a unified interface to access logs from different servers. +* **Log Searchability:** Searching for errors using `grep` in large log files is cumbersome. With "kafka-cp-log-monitor," logs are searchable based on component type, hostname, log level, and more, significantly speeding up the triage process. + +Here is a demo of [Kafka-CP-Log-Monitor](https://www.youtube.com/watch?v=rWhrKLZ8jSg). + +[![Kafka-CP-Log-Monitor](../assets/blog-images/running-kafka-at-scale/logmonitor.png)](https://www.youtube.com/watch?v=rWhrKLZ8jSg) + +## **Kafka-cp-jmx-monitor** + +**"kafka-cp-jmx-monitor"** is a comprehensive monitoring tool designed to capture and visualize Kafka's JMX metrics, providing insights into the health and performance of Kafka clusters. This tool enables us to: + + + +* **Capture JMX Metrics:** Kafka exposes a wealth of JMX metrics critical for smooth cluster operations. "kafka-cp-jmx-monitor" captures these metrics and renders them as meaningful graphs in customizable dashboards. +* **Prometheus and Grafana Integration:** Prometheus scrapes the exposed JMX metrics, and Grafana builds dashboards by querying the Prometheus database. Grafana also supports configuring alerts on the queried metrics. +* **Advanced Alerting:** The tool supports advanced alerting based on anomaly detection, rate of change, etc., enabling proactive issue identification. It provides an in-depth view of trend analysis, capacity planning, and potential issue detection. 
+* **System Metrics Monitoring:** In addition to Kafka JMX metrics, the tool also monitors system metrics of different Kafka component servers by deploying a node-exporter service on the servers. + +The implementation of "kafka-cp-jmx-monitor" gave us a comprehensive understanding of the cluster's operational health, enabling us to predict and address potential issues before they escalated into significant problems. By visualizing trends and patterns in Kafka's performance, we gained valuable insights for capacity planning and resource allocation. Additionally, we configured alerts to detect anomalies at both the cluster and individual node levels, allowing us to avoid many major potential issues and drastically reduce the occurrence of problems across the cluster. + +Here is a demo of [Kafka-cp-jmx-dashboard](https://www.youtube.com/watch?v=1Mr2iy2RkA8). + +[![Kafka-cp-jmx-Monitor](../assets/blog-images/running-kafka-at-scale/jmxmonitor.png)](https://www.youtube.com/watch?v=1Mr2iy2RkA8) + + +## **Kafka-cp-deploy-manager** + +Efficient deployment and configuration management are fundamental to running Kafka at scale. Automating these processes reduces human error and accelerates the time-to-market for new features and updates. + +**"kafka-cp-deploy-manager"** automates Kafka cluster lifecycle management, handling version upgrades, configuration changes, and scaling operations seamlessly. Key features include: + + + +* **State Management:** The tool maintains the state of the Kafka cluster at any point, enabling automated deployments to achieve the desired state. +* **Version Control:** State files of the Kafka cluster are maintained in a GitHub-like version control system, allowing easy rollbacks in case of issues. +* **Continuous Deployment with Jenkins:** Jenkins handles continuous deployment on state changes, abstracting deployment complexity with simplified configurable variables in an interactive UI. + +**Deployment Benefits:** + + + +* **Consistent and Reproducible Pipelines:** Ensures uniformity in deployment processes. +* **Error Mitigation:** Reduces human errors during deployment. +* **Smooth Upgrades and Rollbacks:** Facilitates versioned changes to state files, ensuring stability. +* **Privilege Isolation:** Segregates write and execute privileges, allowing admins to manage state files while developers handle deployments. + +With "kafka-cp-deploy-manager," we achieved consistent deployment pipelines, minimizing errors and ensuring efficient version upgrades and rollbacks. The tool's integration with Jenkins allowed us to streamline the deployment process, enabling faster response times to changes and ensuring the clusters remained aligned with business needs. + +Here is a demo of [Kafka-cp-deploy-manager](https://www.youtube.com/watch?v=oRAipiWWIDg). + +[![Kafka-cp-deploy-Monitor](../assets/blog-images/running-kafka-at-scale/deploymentmanager.png)](https://www.youtube.com/watch?v=oRAipiWWIDg) + + +# **Conclusion** + +Running Kafka at scale for mission-critical customers is both complex and rewarding. By mastering the Kafka ecosystem and adhering to best practices in sizing, tooling, deployment, and performance optimization, organizations can fully leverage Kafka’s potential. Since inception, we have maintained 100% cluster availability, a feat made possible by our dedicated team and the deployment of custom tooling solutions. 
These tools have significantly enhanced our monitoring and management capabilities, contributing to a more efficient and reliable data streaming infrastructure. \ No newline at end of file diff --git a/_posts/2024-07-29-Why-Real-time-API-Monetization-is-the-need-of-the-day.md b/_posts/2024-07-29-Why-Real-time-API-Monetization-is-the-need-of-the-day.md new file mode 100644 index 0000000000..b9d90e66fa --- /dev/null +++ b/_posts/2024-07-29-Why-Real-time-API-Monetization-is-the-need-of-the-day.md @@ -0,0 +1,201 @@ +--- +layout: post +title: "Why Real-time API Monetization is the need of the day?" +authors: ashwin +categories: [Data, SaaS, API Metering, Billing, Real-time, Cost, Product] +image: assets/blog-images/apinomy_blog/ApinomyBlogTitle.webp +featured: false +hidden: false +teaser: Discover how real-time API monetization can unlock new revenue streams and drive innovation in your organization. +toc: true +--- + +# Introduction + +APIs (Application Programming Interfaces) have become one of the most important components of modern digital applications. They allow developers to quickly and easily integrate different services into their applications, making them more powerful and efficient. In a constantly evolving digital economy, APIs are necessary to continuously enable new services, but the need for digital-first strategies have made APIs more of a priority than ever before. Every organization is confronted by the need to make changes fast and adapt to new ways of conducting their business. APIs streamline this process of transformation. + +# APIs Beyond Plain Old Integration + +In the early days, APIs (Application Programming Interfaces) served a straightforward purpose: to enable communication between frontend and backend systems or to facilitate information exchange between internal services. However, as technology has advanced and businesses have evolved, so too have APIs. Today, they are the backbone of complex interactions across distributed systems, supporting real-time data processing, microservices architectures, and seamless connectivity with third-party services and platforms. This evolution has transformed APIs from simple integration tools into critical enablers of digital transformation. + +## The Natural Evolution of APIs: From Endpoints to Marketplaces + +The journey of APIs can be mapped out in four distinct stages: + +1. **Endpoints**: Initially, APIs were mere endpoints—interfaces that allowed different software applications to communicate. +2. **Products**: As the benefits of APIs became more apparent, they began to be developed as products. These productized APIs were designed with a focus on usability, documentation, and developer support. Companies started to offer APIs as standalone products that could be consumed by external developers to build new applications and services. +3. **Platforms**: The next stage in the evolution saw APIs becoming integral to platform-based business models. Companies like Amazon, Google, and Facebook built extensive ecosystems around their APIs, enabling third-party developers to create applications that extended the functionality of their platforms. This shift transformed APIs into powerful tools for ecosystem building and innovation. +4. **Marketplaces**: Today, APIs are at the heart of digital marketplaces. These marketplaces provide a centralized location where developers can discover, evaluate, and purchase APIs from various providers. 
By monetizing their APIs, companies can generate new revenue streams and foster a thriving developer community. + +## Company as a Service + +In the current digital landscape, companies are leveraging APIs to offer their core capabilities as services. This "company as a service" (CAAS) model allows businesses to expose their internal processes and data to external developers, partners, and customers through APIs. By doing so, they can create new business opportunities, improve operational efficiency, and drive innovation. + +For instance, a logistics company might expose its shipment tracking API to e-commerce platforms, enabling real-time tracking for customers. Similarly, a financial institution could offer APIs for payment processing, account management, and fraud detection, allowing fintech startups to build innovative solutions on top of their infrastructure. + +## The API Economy: High Valuations and Strategic Acquisitions + +The evolution of the API economy is evident in the high valuations of companies like Stripe and Solarisbank. These companies have built their success on the strength of their APIs, offering robust and reliable services that are easy to integrate and use. Stripe's APIs, for example, have revolutionized online payments, making it simple for businesses of all sizes to accept payments over the internet. + +Established companies in major economic sectors, particularly fintech, are also recognizing the power of APIs. They are either building platform-based business models that leverage APIs to become leaders in their space or acquiring third-party companies to expand their API-based services. This strategic approach enables them to drive innovation, enhance customer experiences, and stay competitive in a rapidly changing market. + +# The Federated Future of API Management + +While HTTP-based APIs have traditionally dominated the conversation, the reality is that APIs are not restricted to a single protocol. Data APIs, along with various other protocols and interfaces, have become widespread, reflecting the diversity and complexity of the systems they serve. As data emerges as the most valuable asset for organizations, managing these APIs has become both more critical and more challenging. + +## The Expanding Universe of APIs + +APIs today are not just about HTTP. They encompass a wide range of protocols and interfaces, each serving specific purposes and optimizing different aspects of data and service interaction. Here are a few examples: + +- **gRPC**: An open-source remote procedure call (RPC) system that allows clients and servers to communicate transparently. +- **WebSockets**: A protocol providing full-duplex communication channels over a single TCP connection, ideal for real-time applications. +- **PostgreSQL**: Not just a database but also a protocol for managing relational data. +- **S3**: Amazon's Simple Storage Service, which has its own API for object storage. +- **HDFS**: The Hadoop Distributed File System, essential for managing large-scale data storage across clusters. +- **Cassandra**: A distributed NoSQL database with its own API for handling large amounts of unstructured data. + +## The Heterogeneous Interface Landscape + +The interface for exposing services has become highly heterogeneous. This diversity is not limited to the protocols mentioned above but extends across various deployment environments: + +- **Public Clouds**: Services hosted on platforms like AWS, Azure, and Google Cloud. 
+- **Private Clouds**: Dedicated cloud infrastructure managed by organizations. +- **On-Premises**: Traditional data centers still play a crucial role in many industries. +- **Edge Computing**: Computing resources located closer to the data source to reduce latency and improve performance. + +This heterogeneity in interfaces and deployment environments presents a significant challenge: managing this sprawling landscape with a single, monolithic API management system is impractical and inefficient. + +## Beyond Monolithic API Management + +The classic approach to API management, which often revolves around HTTP proxies and centralized cataloging, is increasingly inadequate for today's needs. Here’s why a federated approach is essential: + +1. **Decentralized Control**: In a federated system, each API can be managed independently, allowing for greater flexibility and responsiveness. This is particularly important in hybrid environments where services may span public clouds, private clouds, and on-premises infrastructure. +2. **Protocol Agnosticism**: A federated system can support a wide range of protocols, ensuring that the most appropriate and efficient protocol is used for each specific case. This avoids the one-size-fits-all approach of traditional systems. +3. **Scalability**: Managing APIs in a decentralized manner allows organizations to scale their API infrastructure organically. Each service or application can be scaled independently, optimizing resource usage and performance. +4. **Resilience and Redundancy**: A federated approach enhances system resilience by avoiding single points of failure. If one API endpoint or management system goes down, others can continue to operate unaffected. + +## Embracing the Sprawl + +Given the decentralized nature of modern API ecosystems, the concept of a central catalog is becoming outdated. Instead, organizations should focus on creating interconnected catalogs and discovery mechanisms that can operate across diverse environments. This can be achieved through: + +- **Service Meshes**: Tools like Istio or Linkerd that manage microservices communication, providing visibility and control over diverse API traffic. +- **API Gateways**: Deploying multiple API gateways closer to where the services reside, whether in the cloud, on-premises, or at the edge. +- **Federated Catalogs**: Implementing catalogs that can sync and interoperate, providing a comprehensive view of APIs without centralization. + +# Unlocking the Full Potential of Your API Program + +The technical aspects of API development are well understood but the business potential of APIs remains underexploited. According to a 2023 Kong Customer Survey, 46% of developers say senior leadership lacks a strong understanding of the value of APIs. This divide can have implications for organizations striving to harness the full potential of APIs by embracing an [API-first approach](https://konghq.com/resources/reports/unlocking-the-api-first-operating-model). It's time to elevate API productization and monetization to a first-class feature of API offerings. + +## Revenue: The Ultimate Measure of API Success + +When evaluating the success of an API program, **revenue** stands out as the most definitive metric. In the business world, Profit and Loss (P&L) is the language that speaks volumes about the effectiveness of any initiative. For API programs, revenue generation should be a central focus, not an afterthought. However, most API management platforms currently overlook this crucial aspect. 
+ +## The Overlooked Aspects in API Management Platforms + +API management platforms provide tools for API creation, deployment, and monitoring but often fall short in offering robust features for productizing and monetizing APIs. To bridge this gap, API product managers need a comprehensive approach that considers the following factors: + +### 1. A Unified View Across Your API Real Estate + +To effectively productize APIs, it's essential to have a holistic view of your entire API ecosystem. This includes discovering and cataloging all available APIs, understanding their usage patterns, and identifying opportunities for monetization. A comprehensive API inventory is the first step towards a successful monetization strategy. + +### 2. A First-Class POV on Developer Experience (DevX) + +Developer Experience (DevX) is a critical factor in the success of any API product. A positive DevX ensures that developers can easily discover, understand, and integrate APIs into their applications. This involves providing clear documentation, robust SDKs, and responsive support. Investing in DevX not only attracts more developers but also increases the likelihood of successful API adoption and monetization. + +### 3. Empirical Product Market Fit: Will People Even Use It? + +Before monetizing an API, it's crucial to validate its product-market fit. This involves conducting market research to understand the needs and pain points of potential users. By gathering empirical data and feedback from developers, API product managers can refine their offerings to better meet market demands. Understanding whether there is a genuine need for your API is a prerequisite for successful productization. + +### 4. Price Discovery: Will People Pay for It? + +Once product-market fit is established, the next challenge is price discovery. Determining the right pricing model for your API is critical to maximizing revenue. This could involve usage-based pricing, subscription models, or tiered pricing plans. Conducting market research and competitive analysis can help identify the optimal pricing strategy. Additionally, offering flexible pricing options can cater to different segments of your target audience, increasing the likelihood of monetization success. + +## The Growing Market for API Monetization Platforms + +The importance of API monetization is underscored by the impressive growth projections for the API monetization platform market. [FMI](https://www.futuremarketinsights.com/reports/api-monetization-platform-market) predict a compound annual growth rate (CAGR) of 28.1% during the forecast period 2023-2033, with the industry expected to generate US$ 6.1 billion in 2023 and exceed US$ 72.6 billion by 2033. This rapid growth reflects the increasing reliance on APIs by large companies, with approximately 40% of them using hundreds of APIs in their daily operations. Furthermore, an estimated 71% of developers plan to leverage APIs to a greater extent in the coming years. + +According to [Kong API Impact Report](https://konghq.com/resources/reports/economic-impact-of-apis), **by 2027, APIs are projected to contribute a staggering $14.2 trillion to the global economy.** That’s a projected $3.3 trillion growth in market size from 2023 — a difference greater than the current GDP of the United Kingdom or France. 
+ +# API Monetization: Key Considerations for Success + +Here are key considerations for effectively monetizing your API: + +## Metering Layer + +The API layer can capture and collect data on API usage, performance metrics, and error handling for monitoring, analytics, generating insights about API usage patterns, identifying bottlenecks, or optimizing system performance. So, an API layer remains the most ideal place to record and meter transactions. Ensuring accurate and asynchronous recording of metrics is crucial. This allows for precise tracking of usage, which is essential for billing and analyzing customer behaviour. + +## Choosing the Right Usage Metric + +Selecting the appropriate usage metric is vital for aligning your API's value with customer needs. The business capability delivered by the API should be quantified in terms that are meaningful and self-evident to the user. Instead of focusing solely on API call volumes, maybe consider metrics such as: + +- **Transaction volume:** Number of transactions processed. +- **Data volume:** Gigabytes of data sent or received. +- **Unique users:** Number of distinct users accessing the API. + +Imagine this: billing based on the number of sequences stored in an email marketing tool might not be logical. Why? Customers might keep deleting sequences after sending them. A smarter approach? Gauge the number of unique contacts emailed every month or the total emails sent out. + +The choice of metric should depend on the specific product or service offered and the typical behaviour of your customers. + +## Billing Model: Prepaid vs. Postpaid + +Choosing between prepaid and postpaid billing models involves understanding your customers' preferences and cash flow considerations. + +- **Prepaid:** Customers pay upfront for a set amount of usage. This model is straightforward and ensures immediate revenue, but may require customers to estimate their usage accurately. +- **Postpaid:** Customers are billed based on actual usage after the fact. This can be more flexible and attractive to customers, but poses a risk of delayed payments and bad debt. + +## Pricing Strategy: Tiered Pricing vs. Pay As You Go + +A well-thought-out pricing strategy can significantly impact your API's adoption and profitability. + +- **Tiered Pricing:** Offers different pricing tiers based on usage levels. This model can cater to a wide range of customers, from small startups to large enterprises. It provides predictability and scalability. But sometimes, the price doesn't resonate with what your customers feel they're getting. Too many tiers? They might just freeze, unsure of which to pick. +- **Pay As You Go (Usage based billing):** Charges customers based on actual usage. This model is highly flexible and can be appealing to businesses that prefer to pay for only what they use. It can also encourage more usage as there are no upfront commitments. + +## Invoice Methods: Recurring Invoicing vs. Threshold-Based Invoicing + +Invoicing methods should align with your billing strategy and customer preferences. + +- **Recurring Invoicing:** Bills customers at regular intervals (e.g., monthly, quarterly). This method is predictable and can simplify budgeting for both the provider and the customer. +- **Threshold-Based Invoicing:** Bills customers once they reach a certain usage threshold. This can be effective for usage-based models, ensuring that customers are only billed when they have consumed a significant amount of the service. 
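To make the metering and pricing choices above concrete, here is a small illustrative Python sketch of usage-based billing: it aggregates metered API-call records per tenant and prices the monthly volume against a simple tiered plan. The record shape, tenant names, tier boundaries, and rates are all made-up examples, not a reference implementation.

```python
from collections import defaultdict

# Hypothetical tiered plan: (upper call threshold, price per call in that tier).
TIERS = [
    (100_000, 0.0010),    # first 100k calls
    (1_000_000, 0.0007),  # next 900k calls
    (float("inf"), 0.0004),
]

def price_usage(total_calls: int) -> float:
    """Price a tenant's monthly call volume against the tiered plan."""
    amount, previous_cap = 0.0, 0
    for cap, rate in TIERS:
        calls_in_tier = max(0, min(total_calls, cap) - previous_cap)
        amount += calls_in_tier * rate
        previous_cap = cap
        if total_calls <= cap:
            break
    return round(amount, 2)

def aggregate(gateway_records):
    """Sum metered calls per tenant from gateway access-log records."""
    usage = defaultdict(int)
    for record in gateway_records:
        usage[record["tenant_id"]] += record.get("calls", 1)
    return usage

# Example records, shaped the way a metering pipeline might emit them.
records = [
    {"tenant_id": "acme", "route": "/payments", "calls": 120_000},
    {"tenant_id": "acme", "route": "/refunds", "calls": 30_000},
    {"tenant_id": "globex", "route": "/payments", "calls": 4_000},
]

for tenant, calls in aggregate(records).items():
    print(tenant, calls, "->", price_usage(calls))
```

The same aggregation could just as easily price a custom service unit, such as messages sent or transaction value, by summing that field instead of raw call counts.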
+ +## Focus on Business problems rather than technical problems + +A common challenge for SaaS and PaaS companies is focusing on building metering and billing systems, often at the expense of identifying and pricing their API products effectively. + +- **Focus on Core Strengths:** Rather than developing complex metering and billing UIs, leverage third-party solutions that specialize in these areas. This allows your team to focus on refining the API products themselves. +- **Market-Driven Solutions:** Allow the market to choose the best experience layer, rather than trying to build a one-size-fits-all solution. This can lead to better user experiences and more robust solutions. Don’t cannibalize the customers. +- **Economy of Scale:** By focusing on your core strengths and leveraging specialized providers for billing and metering, you can achieve better economy of scale and allocate resources more efficiently. + +# Apinomy - API (Data) Monetization at Scale + +**Apinomy** enables direct monetization of your APIs and Data, at scale. Helps you in enabling internal, external and 3rd party developers to subscribe to your APIs and Data, with unlimited flexible plans, designed by you. + +![ApinomyBlog1.png](../assets/blog-images/apinomy_blog/ApinomyBlog1.png) + +Apinomy is not an API management platform by itself, but it supports the majority of API platform providers, including Kong, Apigee, Envoy, AWS API Gateway and more. It is simple enough for anyone to build a gateway plugin, based on the capabilities of the gateway. + +Apinomy works by integrating with your gateway to do the following things: + +1. Discover APIs (that is endpoints: hosts, routes, even upstream services and event OpenAPI specs, based on support) +2. Then, it allows you to bundle these APIs into meaningful products. +3. Then, you can create attractive usage based billing models. General models include tiered pricing based on API call volumes, but Apinomy also allows you to define your custom service units. For example, you could define an API plan to be billed against a custom metric, such as SMS API to be billed against number of messages sent, or a payment transaction to be billed against the value of the transaction. +4. Your users can either subscribe to these products through the available rate plans from the default portal that Apinomy provisions, or can also be integrated with your own portal. Rate plans can be prepaid for a licensed usage quota as well. +5. Apinomy, collects logs from the gateway in near real-time and aggregates these in real-time to generate usage reports as well as billing to the bound of a minute. +6. Apinomy seamlessly integrates with the billing solution of your choice: Stripe, Chargebee etc. + +Infact, this is only a fraction of what you can achieve with Apinomy. Over and above this, you can also use Apinomy as an ecosystem platform, to organize your APIs into multiple domains, products and to govern your ecosystem at large. + +To know more about how Apinomy works, take a look the [demo](https://www.youtube.com/watch?v=elNCHQDMn0E) video. + +![ApinomyBlog2.png](../assets/blog-images/apinomy_blog/ApinomyBlog2.png) + +We believe Apinomy will be useful for a variety of data & API monetization use-cases at large, but it also usable for other concerns such as, + +- **SaaS usage based billing** - Bill tenants based on API calls. 
Simplify Fin-ops with tenant level insights on usage against revenue +- **Chargeback for Internal Platforms** - Gain insight into BU + charge-code level usage for shared services SaaS and COTS platforms +- **API Marketplace** - Run your own API Marketplace with API producers, developers (internal, external, partners) as constituents enabling various monetization models + +Future versions of Apinomy will also support data monetization around the Postgres protocol. We will have more exciting news. But for now, Apinomy provides very exciting opportunities in real-time billing for APIs and is the only multi-gateway, multi-cloud usage billing product. This enables creating a comprehensive monetization and revenue plane for your APIs. + +# Conclusion + +Real-time API monetization is critical for organizations looking to unlock new revenue streams and drive innovation. With tools like **Apinomy,** businesses can effectively manage, productize, and monetize their APIs across multiple platforms and environments. By focusing on meaningful metrics, flexible billing models, and a strong developer experience, companies can harness the full potential of their API programs and stay competitive in a rapidly evolving market. As the API economy continues to grow, businesses that embrace real-time monetization will be well-positioned to capitalize on emerging opportunities and sustain long-term success. \ No newline at end of file diff --git a/_posts/2024-07-30-Configuring-OAuth-and-OIDC-for-Confluent-Cloud-using-Auth0.md b/_posts/2024-07-30-Configuring-OAuth-and-OIDC-for-Confluent-Cloud-using-Auth0.md new file mode 100644 index 0000000000..4a5dab2be6 --- /dev/null +++ b/_posts/2024-07-30-Configuring-OAuth-and-OIDC-for-Confluent-Cloud-using-Auth0.md @@ -0,0 +1,213 @@ +--- +layout: post +title: "Authentication to Confluent Cloud Using OAuth/OIDC with Auth0 as Identity Provider" +categories: [Kafka Architecture, Distributed Systems, Security] +author: Vikhyat +teaser: Unlock the power of secure authentication and authorization by integrating Auth0 with Confluent Cloud. In this guide, we'll walk you through configuring OAuth 2.0 and OIDC, ensuring a seamless and secure login experience for your Kafka clusters. Whether you're looking to streamline user management or enhance security, this step-by-step tutorial has you covered. +image: assets/blog-images/auth0-integration_blog/Web_Photo_Editor.jpg +toc: true +--- + +# Configuring OAuth/OIDC for Confluent Cloud using Auth0 + +In the early days of the internet, sharing information between services was straightforward but risky. Users often had to share their usernames and passwords, exposing their credentials to potential misuse. Today, we have secure standards like OAuth 2.0 and OpenID Connect (OIDC) that make this process safer and more efficient. + +The popularity of OAuth and OIDC for authentication and authorization says a lot about the importance of the technology. If you see a login page online on a modern website, it is most likely configured using OAuth 2.0 and OpenID Connect (OIDC). These protocols have become the standard for secure, robust authentication and authorization across various applications and services. + + +## The Problem with Sharing Credentials + +Back in the day, if you wanted a service to access your information on another platform, you had to share your username and password. This practice was insecure for several reasons: + +- No guarantee that the service would keep your credentials safe. 
+ +- No control over how much of your personal information the service could access. + +Thankfully, standards like OAuth 2.0 have been developed to address these security concerns. + + +## Understanding OAuth and OIDC: A Simple Example + +OAuth and OIDC work together to make secure logins easy and seamless. Imagine you want to log in to a new app using your Google account. When you click "Log in with Google," OAuth handles the authorization by asking Google for permission to share your info with the new app. If you agree, OIDC comes into play, providing the app with your identity details, such as your name and email address, so you can log in without creating a new account. This way, OAuth ensures your data stays secure, and OIDC confirms your identity. + + +
    + Image1 +
    + + +For more information on OAuth and OIDC go through this beautifully explained blog post from Okta: [An Illustrated Guide to OAuth and OpenID Connect](https://developer.okta.com/blog/2019/10/21/illustrated-guide-to-oauth-and-oidc) + + +## Scope of this blog + +In this blog post, we will walk through the steps of configuring Auth0 as an identity provider for Confluent Cloud. Auth0 is a flexible, drop-in solution to add authentication and authorization services to your applications. Confluent Cloud is a fully managed event streaming platform based on Apache Kafka. By integrating Auth0 with Confluent Cloud, you can enhance the security of your Kafka clusters and streamline the user authentication process. + + +## Configuring Auth0 as an identity provider in Confluent Cloud + +### Prerequisites: + +1. A Confluent Cloud account. + +2. An Auth0 account. + +3. Administrative access to both platforms. + + +### Steps: + +#### 1. Create a New Application in Auth0 + +1. ##### Log in to Auth0: + + 1. Go to your Auth0 dashboard. + +2. ##### Create a New Application: + + 1. You can see a Create Application button on the getting started page as shows in this picture or alternatively you can go to the application section on the left side of the screen and create an application from there + + Image2 + +2. Both options should land you in this page where you give your application details. Here you give your application name, and select Machine to Machine under application type.  + +3. Below this there is an option to select an API. I will be selecting the default Auth0 Management API in this example. But you can create your own API using the APIs section from the menu. + + Image3 + +4. Once we've selected the API, now we can select the permissions (scopes) we want this API to use. In this case, I am selecting all permissions. + +5. Click on continue to successfully create the Application + + Image4 + +3. ##### Check Application Settings: + + 1. In the application settings, take note of the Client ID and Client Secret. + + Image5 + + +#### 2. Configure Auth0 as an Identity Provider in Confluent Cloud + +1. ##### Log in to Confluent Cloud: + + 1. Go to your Confluent Cloud dashboard. + +2. ##### Navigate to Identity Providers: + + 1. Go to the Accounts and access section + + 2. Navigate to the Workload Identities tab + + Image6 + +3. ##### Add a New Identity Provider: + + 1. Click on Add Identity Providers. + + 2. Choose Other OIDC Provider as the provider type. + + Image7 + +4. ##### Fill in the Identity Provider Details: + + 1. Fill in the Name, Description, OIDC Discovery URL, Issuer URI, JWKS URI as shown in the picture below. + + Image8 + +2. You might be wondering where to find these details. Well, the OIDC Discovery URL can be found under your application settings mentioned as Domain + +3. The issuer URI and and JWKS URI can be found in the OpenID Connect (OIDC) discovery documents found here: [`https://{yourDomain}/.well-known/openid-configuration`](about:blank) + + 1. In my case, it looks something like the image shown below + + Image9 + +4. Once this is filled, click on Validate and Save + +5) **Add Identity Pool**: + + 1. Once the Identity provider is created, we need to create an Identity pool. Cick on Add Identity Pool button as shown below + + Image10 + + 2. Fill in the details as per your requirement. I have filled it as shown in the picture. + + Image11 + + 3. You can use an identity pool to provide granular control over access of your applications to your Confluent Cloud resources. 
An identity pool is a group of external application identities that are assigned a certain level of access based on a claims-based policy + + For details on identity pool and how to use it, Check out [Use Identity Pools with Your OAuth/OIDC Identity Provider on Confluent Cloud](https://docs.confluent.io/cloud/current/security/authenticate/workload-identities/identity-providers/oauth/identity-pools.html#add-oauth-identity-pools). + + + 4. Click next once you are done with populating the fields. + + 5. Since we are creating everything from scratch we will select Add new permissions as shown below and hit next. + + ![Image6](../assets/blog-images/auth0-integration_blog/Identity-pool2.png) + + 6. On the next page, select the cluster you want to and give the permissions required and finish the setup. + +#### 3. Configuring Kafka Client + +1. Open your terminal from where you are trying to access the confluent cloud and where your client is present. + +2. Create a client configuration, in my case I have named it client.properties + +3. Use the following template to fill in your details  + +``` +# Kafka Broker Connection +bootstrap.servers=YOUR_BOOTSTRAP_SERVER + +# Security Protocol +security.protocol=SASL_SSL + +# OAuth2 Token Endpoint URL +sasl.oauthbearer.token.endpoint.url=YOUR_TOKEN_ENDPOINT_URL + +# Login Callback Handler Class +sasl.login.callback.handler.class=org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler + +# SASL Mechanism +sasl.mechanism=OAUTHBEARER + +# JAAS Configuration +sasl.jaas.config= \ + org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required \ + clientId='YOUR_CLIENT_ID' \ + clientSecret='YOUR_CLIENT_SECRET' \ + extension_logicalCluster='YOUR_LOGICAL_CLUSTER' \ + extension_identityPoolId='YOUR_IDENTITY_POOL_ID' \ + scope='YOUR_SCOPES' \ + audience='YOUR_API_IDENTIFIER'; +``` + +4. In my case, I have used the following config + +``` +bootstrap.servers=pkc-p11xm.us-east-1.aws.confluent.cloud:9092 +security.protocol=SASL_SSL +sasl.oauthbearer.token.endpoint.url=https://dev-jpup1hj0aphkbijm.us.auth0.com/oauth/token +sasl.login.callback.handler.class=org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler +sasl.mechanism=OAUTHBEARER +sasl.jaas.config= \ + org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required \ + clientId='8GJmNPXY5UV1yrSNgc1ggiKkYcHiifFM' \ + clientSecret='wSQzDaI-MRB80w2HyzKbV-JjPS4Ijd5zUu10LdisgEDR7_LRoC98ruBGgnLd_Lha' \ + extension_logicalCluster='lkc-37vn0o' \ + extension_identityPoolId='pool-AXqR' \ + scope='read:users'; +``` + +_Note that, you should never share your client details, especially the client secret. I have deleted all instances of my application, hence I haven't redacted the client secret._ + +5. Here we can add as many permissions as we want to access under the scope parameter. But this can be done only if we have allowed the access in the identity pool. + +6. We can use this client.properties to access the kafka cluster depending on the amount of permissions we have. + + +## Conclusion + +In conclusion, integrating Auth0 as an identity provider for Confluent Cloud significantly enhances the security and efficiency of your Kafka clusters. By leveraging the power of OAuth 2.0 and OpenID Connect, you can ensure that user credentials are protected, and access is precisely controlled. This setup not only streamlines the authentication process but also provides a robust framework for managing permissions and identities. 
As you follow the steps outlined in this guide, you'll find that configuring Auth0 with Confluent Cloud is straightforward and highly beneficial for maintaining a secure, scalable, and user-friendly environment for your event streaming applications.
diff --git a/_posts/2024-07-30-Unlocking-the-Potential-of-Local-Kafka.md b/_posts/2024-07-30-Unlocking-the-Potential-of-Local-Kafka.md
new file mode 100644
index 0000000000..6b9371cc2b
--- /dev/null
+++ b/_posts/2024-07-30-Unlocking-the-Potential-of-Local-Kafka.md
@@ -0,0 +1,147 @@
+---
+layout: post
+title: "Unlocking The Potential of Local Kafka"
+categories: [Apache Kafka, Local Kafka, Docker Compose, Ansible ]
+teaser: Unlock the full potential of local Kafka setups for rapid development, testing, and seamless management, and discover the best method for your local Kafka setup.
+authors: Subramanya
+featured: false
+hidden: false
+image: /assets/blog-images/Unlocking-the-Potential-of-Local-Kafka/Blog_image.png
+toc: true
+---
+
+
+# Introduction
+
+Apache Kafka is a distributed streaming platform widely used for building real-time data pipelines and streaming applications. While cloud-based Kafka services are available, setting up Kafka locally offers a range of benefits, especially for development and testing. In this blog, we will explore why a local setup is beneficial, cover the various methods to achieve it, and provide a step-by-step guide for those methods. We will also introduce a few resources for setting up Kafka locally, designed to simplify your testing and development processes and enable you to deploy, manage, and monitor Kafka clusters efficiently on your local machine.
+
+
+# Problem Statement
+
+When working on small tasks and testing in cloud environments, developers often face significant challenges.
+
+**Initial Setup Time**: Provisioning a Kafka cluster can take considerable time, especially for initial setups. For example, deploying a managed Kafka service (MSK) using AWS CDK can take around 20 minutes for the first run [(GitHub)](https://github.com/mariusfeteanu/kafka-on-cdk). This setup time is comparable across other major cloud providers such as Azure and Google Cloud Platform (GCP).
+
+**Cluster Creation**: Provisioning a Kafka cluster typically takes longer compared to producing/consuming messages. Creating a Kafka cluster on Confluent Cloud involves provisioning the necessary infrastructure and configuring settings. According to Confluent, this process takes approximately 10 minutes for a standard three-broker cluster.
+
+**Produce/Consume Messages**: These involve the actual use of the resources, such as producing and consuming messages, and typically have very low latency, often under 10 milliseconds for individual operations.
+
+**Message Latency**: The 99th percentile end-to-end latency (p99 E2E latency) for Confluent Cloud is generally low, with measurements such as 42 milliseconds at 10 MBps throughput and up to 73 milliseconds at 1.4 GBps throughput [(Confluent)](https://www.confluent.io/blog/kafka-vs-kora-latency-comparison/).
+
+**Frequency of Occurrence**: The significant time investment is primarily during the initial setup. Once the infrastructure is in place, making minor adjustments or running tests is faster, although any changes that require re-provisioning of resources will still take time.
+
+The above timeframes for cluster creation and producing/consuming messages highlight the difference between setting up resources and using them, illustrating why developers might prefer local setups for rapid iterations and tests.
+
+
+# Why Set Up Kafka Locally?
+
+
+
+* **Development and Testing:** Local setups allow for rapid development cycles and immediate testing without the latency or cost implications of cloud services.
+
+* **Platform Independence:** Using Docker and Docker Compose, a local Kafka setup can be made platform-independent, allowing you to develop and test on any machine without worrying about compatibility issues.
+
+* **Greater Control:** A local setup provides direct access to Kafka’s components, enabling finer-grained control over configuration, logging, and monitoring, which is essential for complex event-driven architectures.
+
+* **Cost-Effective:** Eliminates the need for cloud resources, making it cost-effective, especially for small teams or individual developers, and for the times when developers forget to delete their resources.
+
+
+# Methods for Setting Up Kafka Locally
+
+
+
+* **Manual Installation**: Downloading and configuring Kafka and Zookeeper manually on your machine. This method provides deep insights into Kafka's workings but is time-consuming.
+
+    After downloading and configuring Kafka and Zookeeper (deprecated since 3.5 and removed in 4.0) manually on your machine, you can create a cluster by starting multiple instances of Kafka and Zookeeper on different ports.
+
+
+    Refer to the official Apache Kafka documentation for a step-by-step guide: [Apache Kafka Quickstart](https://kafka.apache.org/quickstart)
+
+* **Using Package Managers**: Tools like Homebrew on macOS, apt on Linux, and choco on Windows can simplify the installation process. After installation, creating a Kafka cluster works the same way as with a manual installation: you start multiple Kafka broker instances with different configurations.
+
+* **Docker**: Using Docker to containerize Kafka and Zookeeper makes the setup process quicker and more consistent. Docker provides a portable and lightweight environment that ensures your Kafka setup works the same way across different systems.
+
+* **Docker Compose**: Docker Compose further simplifies this process by allowing you to define and manage multi-container Docker applications. With a single YAML file, you can specify the services, networks, and volumes required for Kafka and Zookeeper, making it easy to deploy and manage your Kafka environment consistently across different development and testing setups.
+
+
+## Preferred Method for Local Installation
+
+Using Docker Compose to set up Kafka locally is the best of these approaches due to its consistency, ease of use and scalability, i.e., the ability to easily adjust the number of Kafka and Zookeeper instances, allocate appropriate resources, manage deployments efficiently, and ensure consistent performance and isolation across different environments.
+
+Manual installation and installation via package managers are largely equivalent, since a package manager merely eliminates the step of downloading the installation files manually. Both methods lack the portability and consistency that Docker offers, making them less ideal for replicating environments across different machines, and setting up and configuring each instance by hand is time-consuming and prone to errors.
+
+Hence Docker Compose’s ability to provide a consistent setup across different machines, along with powerful tools for monitoring and troubleshooting, makes it the ideal solution for running Kafka locally.
+
+
+# So How Do You Make Use of a Local Kafka Setup?
+
+At Platformatory, we leverage a Kafka sandbox that is set up locally and use it for testing and troubleshooting. The Kafka sandbox is an isolated environment that mimics a cloud setup using Docker Compose, with Prometheus and Grafana deployed for observability and debugging, thereby achieving all of this without the overhead of cloud infrastructure.
+
+
+## What are the benefits of using the Sandbox?
+
+**Realistic Testing and Performance Tuning:** The benefit of having a setup similar to the production environment is that you can simulate, to a reasonable degree, how Kafka and its associated services will behave under real-world conditions, and potential issues and bugs can be identified and resolved before they impact the production environment.
+
+We can conduct performance and load testing to understand how the system scales and performs under various conditions, and fine-tuning the configurations and resource allocations in the sandbox based on the results can help optimize the production setup.
+
+**Monitoring and Metrics:** By integrating Prometheus and Grafana, we continuously monitor Kafka’s performance and health. This lets us observe cluster behavior when we experiment with configuration changes and helps us proactively identify and address any potential bottlenecks or issues.
+
+**Configuration Management:** The sandbox uses properties files mounted as volumes, which helps replicate certain aspects of a VM or bare-metal server environment. This includes persistent configuration management and network settings that provide isolation and resource allocation similar to traditional server setups. This allows us to manage configurations easily and consistently across different environments. However, it is essential to note that containers do not completely replicate the behavior of VMs or bare-metal servers, particularly regarding hardware interaction and persistent storage.
+
+
+# Using the Confluent Platform (CP) Sandbox
+
+The CP sandbox is built using Docker Compose, which allows us to define and run multi-container Docker applications. A Docker Compose file for Confluent Kafka, with configuration mounted as properties files, brings up Kafka and its components with JMX metrics exposed and visualized using Prometheus and Grafana. The environment simulates running Confluent Platform on VMs or bare-metal servers using properties files, but with Docker containers.
+
+Access the [CP-Sandbox](https://github.com/Platformatory/cp-sandbox.git) repository here.
+
+The cluster contains the following resources deployed:
+
+
+
+* 1 Zookeeper
+* 3 Kafka Brokers
+* LDAP Server
+* Control Center
+* Kafka Client Container
+* Prometheus and Grafana
+
+Check the Kafka server.properties files for more details about the configuration of the setup.
+
+
+## Running the CP Sandbox
+
+**Clone the Repository:** Start by cloning the repository containing the Docker Compose file and configuration files; you can access the repository via the link above.
+
+```bash
+git clone git@github.com:Platformatory/cp-sandbox.git
+```
+
+```bash
+cd cp-sandbox
+```
+
+**Start the Services:** Use Docker Compose to start all the services defined in the docker-compose.yml file.
+
+```bash
+docker-compose up -d
+```
+
+This command will bring up a three-node Kafka cluster with security enabled, along with other components like Zookeeper, Schema Registry, Kafka Connect, Control Center, Prometheus, Grafana, and OpenLDAP. In addition to the main branch, there are 12 branches, each containing a troubleshooting scenario for you to switch to and attempt.
+
+**Check Service Status:** Verify that all services are up and running.
+
+```bash
+docker-compose ps -a
+```
+
+Ensure there are no exited services and all containers have the status Up.
+
+
+# Conclusion
+
+Setting up Kafka locally unlocks immense potential for developers, offering a swift, cost-effective, and controlled environment for development, testing, and troubleshooting. Whether you choose manual installation, package managers, or Docker Compose, a local Kafka setup can significantly enhance your development workflow. Among these methods, Docker Compose stands out for its consistency, scalability, and ease of use, making it the ideal solution for running Kafka on your local machine.
+
+By using the CP Sandbox, you can efficiently set up, manage, and gain insights into troubleshooting Kafka clusters. The troubleshooting scenarios in the repository allow you to practice and resolve potential production issues in a controlled environment.
+
+Sounds interesting? If you wish to know more, take a look at our next blog, “Learning by Debugging: Kafka Local Setup”, where we dive deeper into the troubleshooting scenarios and show how to set up and use the monitoring tools, such as Grafana, that are deployed along with the sandbox. In the next blog we also take a look at the CP Ansible Sandbox, a setup which can be used for automated deployment and management of Confluent Platform clusters.
diff --git a/_posts/2024-07-31-Learning-by-Debugging-Kafka-Local-Setup.md b/_posts/2024-07-31-Learning-by-Debugging-Kafka-Local-Setup.md
new file mode 100644
index 0000000000..3ba7b610f2
--- /dev/null
+++ b/_posts/2024-07-31-Learning-by-Debugging-Kafka-Local-Setup.md
@@ -0,0 +1,400 @@
+---
+layout: post
+title: "Learning by Debugging: Kafka Local Setup"
+categories: [Apache Kafka, Local Kafka, Docker Compose, Ansible, Kafka Troubleshooting ]
+teaser: Discover how to set up and troubleshoot a local Kafka environment using Docker Compose and Ansible, and gain hands-on experience and debugging skills in a safe, isolated setting. Dive into real-world scenarios and enhance your Kafka expertise.
+authors: Subramanya
+featured: false
+hidden: false
+image: /assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/title.webp
+toc: true
+---
+
+# Introduction:
+
+In the realm of data streaming and event-driven architectures, hands-on experience and real-time troubleshooting are invaluable. This blog provides an in-depth guide to setting up and using a local Kafka environment with Docker Compose. By leveraging a sandbox setup, you can experiment, test, and debug Kafka configurations in an isolated environment that mimics a cloud setup. Why use an environment that mimics a cloud setup, you ask? We have a blog on exactly that, and it explains why a local Kafka setup is so useful; make sure to check it out too: [Unlocking the Potential of Local Kafka.](https://platformatory.io/blog/Unlocking-the-Potential-of-Local-Kafka/)
+
+A sandbox is an isolated environment that allows developers to safely test and troubleshoot without impacting the production environment.
This makes it an essential tool for ensuring that code changes and configurations work correctly before deployment. In this blog, we will introduce the Confluent Platform Sandbox and Confluent Platform Ansible Sandbox, powerful tools that simplify the process of setting up and managing Kafka clusters locally using Docker Compose. + +We will guide you through the step-by-step process of setting up the CP Sandbox, show how it can be used for testing and debugging, and delve into various troubleshooting scenarios. Additionally, we will cover the integration of monitoring tools like Prometheus and Grafana for enhanced observability and debugging. Finally, we will introduce the CP Ansible Sandbox, which automates the deployment and management of Kafka clusters, further streamlining your development workflow. + + +# The CP Sandbox- Usage and Setup: + +The CP Sandbox is built using Docker Compose, which allows us to define and run multi-container Docker applications seamlessly. This setup includes a comprehensive environment with Confluent Kafka and its essential components, making it ideal for simulating a production-like setup on local machines. Here’s a detailed look into its setup and components: + +**Configuration Mounting**: The Docker Compose file mounts configuration files (properties files) into the containers. These properties files are critical as they allow customization of each component’s settings, ensuring the environment behaves just like it would on VMs or bare-metal servers to a certain extent. + +**Simulating Production Environments** + +The environment simulates running Confluent Platform on VMs or bare-metal servers but leverages Docker containers for convenience and portability. This simulation is achieved by: + +**Properties Files**: Using properties files to configure Kafka and its components, mirroring how configurations would be managed in a non-containerized setup. This approach ensures that the transition from development to production is smooth, as the configurations remain consistent. + +**Docker Containers**: Docker containers encapsulate each service, providing isolation and consistency. This means you can run the same environment on any machine with Docker installed, ensuring that your setup behaves predictably across different development and testing environments. + +**Troubleshooting Scenarios** + +The CP Sandbox repository includes various branches, each containing different troubleshooting scenarios. These scenarios are designed for Kafka administrators to practice handling production-like issues. Some examples include: + +**Topic Access Issues**: Simulate the inability to produce or consume from a newly created topic. This helps administrators practice checking and configuring ACLs, verifying user permissions, and ensuring network connectivity. + +**Missing Broker Metrics:** Address issues where broker metrics are not captured in the control center despite setting up Confluent Metrics Reporter. This scenario helps in verifying configuration settings and connectivity between brokers and the control center. + +**Platform Migration Issues**: Simulate issues following a lift-and-shift migration to different machines, such as SSL handshake failures or configuration mismatches. Administrators can practice resolving these problems by examining error logs and reconfiguring components. + +These scenarios provide hands-on experience in diagnosing and resolving issues, making administrators better prepared for real-world challenges. 
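+
+As an illustration of the kind of investigation the topic-access scenario calls for, the sketch below shows how an administrator might inspect and grant ACLs from the sandbox's client container. The topic and principal names are placeholders, and the grant step assumes the properties file you pass belongs to a principal with sufficient (super-user) permissions.
+
+```bash
+# Exec into the client container that ships with the sandbox
+docker exec -it kfkclient bash
+
+# List existing ACLs for the topic to see whether the user has been granted access
+kafka-acls --bootstrap-server kafka1:19092 \
+  --command-config /opt/client/client.properties \
+  --list --topic <topic-name>
+
+# If nothing is returned, grant produce and consume rights to the user
+kafka-acls --bootstrap-server kafka1:19092 \
+  --command-config /opt/client/client.properties \
+  --add --allow-principal User:<username> \
+  --producer --consumer --group '*' \
+  --topic <topic-name>
+```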
+ + +## Here is a step by step guide on setting up the CP Sandbox: + +Clone the Repository: Start by cloning the repository containing the Docker Compose file and configuration files, you can access the repository via this [link](https://github.com/Platformatory/cp-sandbox.git). + +```bash +git clone git@github.com:Platformatory/cp-sandbox.git +``` + +```bash +cd cp-sandbox +``` + +Start the Services: Use Docker Compose to start all the services defined in the docker-compose.yml file. + +```bash +docker-compose up -d +``` + +This command will bring up a three-node Kafka cluster with security enabled, along with other components like Zookeeper, Schema Registry, Kafka Connect, Control Center, Prometheus, Grafana, and OpenLDAP. + +NOTE: Be sure to update or extend the validity of the SSL certificates for the Kafka brokers. Expired certificates can cause broker containers to go down due to authentication errors resulting from invalid certificates. Use the blog post [here](https://www.golinuxcloud.com/renew-self-signed-certificate-openssl/) for a guide on how to renew an expired SSL certificate. + +Use the below command to inspect the keystore and check the validity of the certificate. + +```bash +keytool -list -v -keystore <> +``` + +(The keystore file, kafka.server.keystore.jks is located in the broker folder for each broker) + +Check Service Status: Verify that all services are up and running. + +```bash +docker-compose ps -a +``` + +Ensure there are no exited services and all containers have the status Up. + +If any of the containers have gone down, examine the logs of the respective containers as shown below to get a better understanding of the issue for troubleshooting. + +Logging: Check the logs of the respective service by its container name. + +```bash +docker logs <> +``` + +Monitoring: For monitoring, Prometheus and Grafana are configured to visualize JMX metrics. + +Using the Kafka Client: To interact with the Kafka cluster, you can use the Kafka client container. + +```bash +docker exec -it kfkclient bash +``` + +The Kafka client container contains the Kafka CLI and other tools necessary for troubleshooting Kafka. It also includes a properties file mounted to /opt/client that can be used to define client properties for communicating with Kafka. + +Restarting Services: If needed, you can restart a particular service. + +```bash +docker-compose restart <> +``` + +Alternatively, you can force recreate the service. + +```bash +docker-compose up -d --force-recreate <> +``` + +Below is a part of the docker-compose file from the repository which sets up the Confluent Kafka and Zookeeper resources with JMX metrics exposed, using Docker containers. 
+ +```yaml +--- + +version: '3.7' +services: + zookeeper1: + image: confluentinc/cp-zookeeper:7.4.0 + hostname: zookeeper1 + container_name: zookeeper1 + ports: + - "2181:2181" + command: zookeeper-server-start /etc/kafka/zookeeper.properties + environment: + EXTRA_ARGS: -javaagent:/usr/share/jmx-exporter/jmx_prometheus_javaagent-0.20.0.jar=9103:/usr/share/jmx-exporter/zookeeper.yml + volumes: + - ./zookeeper1:/etc/kafka + - ./jmx-exporter:/usr/share/jmx-exporter + deploy: + resources: + limits: + cpus: "1" + memory: 512M + + kafka1: + image: confluentinc/cp-server:7.4.0 + hostname: kafka1 + container_name: kafka1 + depends_on: + - zookeeper1 + command: kafka-server-start /etc/kafka/server.properties + environment: + EXTRA_ARGS: -javaagent:/usr/share/jmx-exporter/jmx_prometheus_javaagent-0.20.0.jar=9101:/usr/share/jmx-exporter/kafka_broker.yml + volumes: + - ./kafka1:/etc/kafka + - ./jmx-exporter:/usr/share/jmx-exporter + deploy: + resources: + limits: + cpus: "1.5" + memory: 1536M +``` + +The current setup uses Confluent Platform version 7.4.0 as specified by the image confluentinc/cp-server:7.4.0 in the above code, this version is compatible with the 3.4 version of Apache Kafka (Refer [here ](https://docs.confluent.io/platform/current/installation/versions-interoperability.html)for Supported Versions and Interoperability for Confluent Platform) + +Visit the [Confluent Docker Hub](https://hub.docker.com/u/confluentinc) to find the latest version of the Confluent Kafka images and use the image for upgrading to the latest version. + + +# Troubleshooting using the Platformatory Kafka Sandbox: + +The sandbox includes 12 distinct troubleshooting scenarios, selected from key Kafka concepts and potential production issues. Each scenario is available on a different branch of the repository, allowing you to switch between branches to practice and resolve various Kafka-related challenges. + +Below are a few scenarios from the repository: + +**Scenario 1** + +**Problem Statement** + +The client has created a new topic europe_payments but is unable to produce/consume from the topic from the host kfkclient using the user kafkaclient1 using the following commands - + +```bash +kafka-console-producer --bootstrap-server kafka1:19092 --producer.config /opt/client/client.properties --topic europe_payments +``` + +```bash +kafka-console-consumer --bootstrap-server kafka1:19092 --consumer.config /opt/client/client.properties --from-beginning --topic europe_payments +``` + +The client is using SASL/PLAIN over PLAINTEXT with the user kafkaclient1 + +The error message seen in the console producer and consumer for europe_payments - + +```bash +[2023-07-26 12:18:20,309] WARN [Producer clientId=console-producer] Error while fetching metadata with correlation id 4 : {europe_payments=TOPIC_AUTHORIZATION_FAILED} (org.apache.kafka.clients.NetworkClient) + +[2023-07-26 12:18:20,409] ERROR [Producer clientId=console-producer] Topic authorization failed for topics [europe_payments] (org.apache.kafka.clients.Metadata) + +[2023-07-26 12:18:20,411] ERROR Error when sending message to topic europe_payments with key: null, value: 6 bytes with error: (org.apache.kafka.clients.producer.internals.ErrorLoggingCallback) + +org.apache.kafka.common.errors.TopicAuthorizationException: Not authorized to access topics: [europe_payments] +``` + +**Scenario 2:** + +**Problem Statement** + +There are no broker metrics captured in the control center. 
The control center has a message - Please set up Confluent Metrics Reporter to view broker metrics. The customer has setup Confluent Metrics Reporter but is still unable to view broker metrics + +**Scenario 3** + +**Problem Statement** + +The client just performed a lift and shift on their entire platform to different machines. The brokers and several other components are down. + +The brokers have the following error log - + +```bash +java.lang.RuntimeException: Received a fatal error while waiting for all of the authorizer futures to be completed. + at kafka.server.KafkaServer.startup(KafkaServer.scala:950) + at kafka.Kafka$.main(Kafka.scala:114) + at kafka.Kafka.main(Kafka.scala) + +Caused by: java.util.concurrent.CompletionException: org.apache.kafka.common.errors.SslAuthenticationException: SSL handshake failed + at java.base/java.util.concurrent.CompletableFuture.encodeRelay(CompletableFuture.java:367) + at java.base/java.util.concurrent.CompletableFuture.completeRelay(CompletableFuture.java:376) + at java.base/java.util.concurrent.CompletableFuture$AnyOf.tryFire(CompletableFuture.java:1663) + at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) + at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) + at io.confluent.security.auth.provider.ConfluentProvider.lambda$null$10(ConfluentProvider.java:543) + at java.base/java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:986) + at java.base/java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:970) + at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) + at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) + at io.confluent.security.store.kafka.clients.KafkaReader.lambda$start$1(KafkaReader.java:102) + at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) + at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) + at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) + at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) + at java.base/java.lang.Thread.run(Thread.java:829) +``` + +Each of the scenarios are presented in a similar way with a problem statement, detailing the issue the user is encountering, the commands attempted while facing the issue, and the error message received that can be used for troubleshooting the issue. + + +# Monitoring and Metrics: + +Each Kafka and Zookeeper instance is configured with a JMX exporter, which exposes metrics in a format that Prometheus can scrape, prometheus scrapes these metrics from the JMX exporters at regular intervals. Grafana queries Prometheus for the stored metrics and presents them in various visual formats, such as graphs, charts, and tables. + +```yaml + EXTRA_ARGS: -javaagent:/usr/share/jmx-exporter/jmx_prometheus_javaagent-0.20.0.jar=9101:/usr/share/jmx-exporter/kafka_broker.yml +``` + +In the above code snippet taken from the Docker Compose configuration, the EXTRA_ARGS environment variable is used to enable JMX (Java Management Extensions) monitoring by specifying a Java agent. This agent collects metrics and exposes them in a format that can be scrapped by Prometheus. + +**Accessing the Grafana Dashboard:** + +Open your browser and go to http://localhost:3000. 
The default login credentials are admin for both the username and password. You can change these after the first login for security purposes.
+
+**Explore Dashboards:**
+
+Once logged in, navigate to the pre-configured dashboards to start monitoring the Kafka cluster. You can create additional dashboards and visualizations as needed.
+
+Below are a couple of snapshots from the Kafka Cluster Dashboard.
+
+
+![grafana_image1](../assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF1.png)
+
+
+![grafana_image2](../assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF2.png)
+
+
+This dashboard houses some of the key metrics for cluster monitoring, such as basic broker metrics, request rate metrics, and system metrics like CPU usage and JVM memory usage.
+
+Another resource we use here at Platformatory is the CP-Ansible Sandbox. This sandbox provides an automated way to set up and manage Kafka environments using Ansible playbooks, making the deployment process fully automated.
+
+
+# Introduction to the CP-Ansible Sandbox
+
+The CP-Ansible sandbox can be used for automated deployment and management of Confluent Platform clusters. This sandbox environment allows us to simulate and manage our Kafka infrastructure efficiently using Ansible playbooks within a Dockerized setup. This provides a seamless and consistent environment for testing, deploying, and managing Kafka clusters.
+
+The CP-Ansible sandbox is designed to run within a Docker environment. It includes a control node container and multiple inventory nodes, all managed through systemd and SSH. This setup enables us to execute Ansible playbooks to manage the Kafka clusters from a central control node.
+
+The cluster contains the following resources deployed:
+
+
+
+* Ansible Control Node
+* 3 Zookeeper
+* 3 Kafka Brokers
+* Control Center
+
+
+# Setting Up the CP-Ansible Sandbox
+
+**Cloning the Repository**: Start by cloning the repository; you can access it via this [link](https://github.com/Platformatory/cp-ansible-sandbox.git).
+
+```bash
+git clone git@github.com:Platformatory/cp-ansible-sandbox.git
+```
+
+```bash
+git submodule update --init --recursive
+```
+
+**Starting the Docker Environment:**
+
+```bash
+docker compose up -d
+```
+
+**Verifying Containers:**
+
+```bash
+docker compose ps -a
+```
+
+**Setting Up SSH Keys:**
+
+```bash
+./setup-ssh-keys.sh
+```
+
+**Accessing the Ansible Control Node:**
+
+```bash
+docker compose exec -it ansible-control bash
+```
+
+Once the environment is set up, the control node can manage all inventory nodes via SSH. The Ansible playbooks can be executed from this control node to deploy and manage the Confluent Platform.
+
+**Verifying SSH Connectivity**
+
+Inside the Ansible control node, verify the SSH connectivity with other nodes:
+
+```bash
+ssh root@zookeeper1
+```
+
+# Running the CP-Ansible Playbook
+
+Install the Confluent Platform playbook:
+
+Run the below command inside the Ansible control node container.
+
+```bash
+ansible-playbook confluent.platform.all
+```
+
+This assumes the inventory file is located at /ansible/inventories/ansible-inventory.yml.
+
+Below is the inventory file for setting up the cluster.
+ +```yaml +--- + +all: + vars: + ansible_connection: ssh + ansible_user: root + ansible_become: true + ansible_ssh_private_key_file: + +zookeeper: + hosts: + zookeeper1: + zookeeper2: + zookeeper3: + +kafka_broker: + hosts: + kafka1: + kafka2: + kafka3: + +control_center: + hosts: + control-center: +``` + +**Custom Inventory Files** + +To use a custom inventory file: + +```bash +ansible-playbook -i /path/to/custom/inventory-file confluent.platform.all +``` + +**Managing the Inventory** + +The inventory files are located in the inventories directory and are mounted to /ansible/inventories/ on the control node container. You can create new inventory files or update existing ones locally, and these changes will be reflected on the control node. + +**Additional Files** + +Any additional files needed for the Ansible playbooks can be added to the share directory locally. These files will be available in /usr/share/cp-ansible-sandbox on the Ansible control node. + + +# Conclusion + +By using the CP Sandbox and CP-Ansible Sandbox, developers and administrators can effectively set up, manage, and troubleshoot Kafka clusters in a controlled, isolated environment. These sandboxes offer a hands-on approach to learning, testing, and debugging Kafka configurations, mimicking a production setup without the associated risks. The CP Sandbox provides an environment built with Docker Compose, allowing for easy setup and management of multi-container applications, while the CP-Ansible Sandbox automates the deployment process using Ansible playbooks, ensuring consistency and efficiency. + +Whether you're a seasoned Kafka administrator looking to sharpen your troubleshooting skills or a developer wanting to experiment with Kafka configurations, these sandboxes provide the tools and scenarios necessary for mastering Kafka management. Through practical exercises and real-world scenarios, you'll gain invaluable experience in diagnosing and resolving issues, ultimately preparing you for the complexities of managing Kafka in a production environment. + +So, dive into the CP Sandbox and CP-Ansible Sandbox, explore the various troubleshooting scenarios, and harness the power of these tools to enhance your Kafka expertise. Happy Kafkaing! diff --git a/_posts/2024-08-13-Automating-Kafka-Deployments-with-CICD.md b/_posts/2024-08-13-Automating-Kafka-Deployments-with-CICD.md new file mode 100644 index 0000000000..507d15c2ee --- /dev/null +++ b/_posts/2024-08-13-Automating-Kafka-Deployments-with-CICD.md @@ -0,0 +1,171 @@ +--- +layout: post +title: "Automating Kafka Deployments with CI/CD" +categories: [Kafka Architecture, Distributed Systems, DevOps, CI/CD] +author: Vikhyat +teaser: Unlock seamless Kafka cluster management with Kafka CP Deploy Manager! Discover how we can make deployments easier, mitigate human error, and ensures smooth upgrades— all with a user-friendly interface that doesn't require deep Kafka expertise. Ready to elevate your Kafka operations? Dive into our latest blog post! +image: assets/blog-images/Automating-Kafka-Deployments-with-CICD/banner.jpg +toc: true +--- + +# Automating Kafka Deployments with CI/CD + +Deploying Kafka, whether the open-source version or Confluent Platform, presents a myriad of challenges like managing dependencies and ensuring compatibility to handling scaling and fault tolerance, the process can be intricate and time-consuming. 
This blog delves into the current deployment options for Kafka, the challenges faced, and how a CI/CD implementation can revolutionize your deployment strategy.
+
+
+## **Current Kafka Deployment Options**
+
+We won't cover manual deployments since they aren't typically used in production environments. Instead, there are two main methods for deploying Kafka:
+
+1. **Ansible**: Ansible automates Kafka deployment through the use of playbooks, which are YAML files defining the deployment process. By leveraging Ansible, you can configure, deploy, and manage Kafka clusters with minimal manual intervention. Ansible’s agentless architecture allows it to work seamlessly across different environments, making it a flexible and reliable option for managing Kafka deployments.
+
+2. **Kubernetes**: Deploying Kafka on Kubernetes allows you to leverage container orchestration, providing scalability, fault tolerance, and ease of management. Kubernetes can manage the entire lifecycle of Kafka clusters, from deployment and scaling to upgrades and failure recovery. Tools like Strimzi or the Confluent Operator can be used to simplify the deployment and management of Kafka on Kubernetes.
+
+Deploying Kafka or the Confluent Platform is a multifaceted process that requires careful planning, configuration, and ongoing management to ensure optimal performance and reliability. The complexity arises from the need to balance various factors such as hardware requirements, security, data management, monitoring, and scaling. Expertise and continuous attention to detail are crucial for successful deployments.
+
+
+## **Challenges in Kafka Deployment**
+
+For Ansible-deployed clusters:
+
+- Requires familiarity with Ansible and its configuration.
+
+- Playbooks might need adjustments for complex or non-standard setups.
+
+And for Kubernetes:
+
+- Requires a Kubernetes environment and knowledge of Kubernetes operations.
+
+- Initial setup and configuration might be complex for those unfamiliar with Kubernetes.
+
+
+## **How CI/CD Can Work for Kafka Deployment**
+
+Implementing Continuous Integration and Continuous Deployment (CI/CD) for Kafka offers several benefits:
+
+- **Single Source of Truth**: Centralized configuration management ensures consistency across environments.
+
+- **Administrator-Regulated Using Version Control**: Changes are tracked, audited, and managed through version control systems like Git, enhancing security and compliance.
+
+- **Ease of Rolling Updates**: Automated deployment pipelines facilitate seamless updates and reduce the risk of human error.
+
+- **One-Push Mechanism**: Simplified deployment process through a user-friendly interface, eliminating the need for in-depth Kafka expertise.
+
+
+## **In Comes kafka-cp-deploy-manager**
+
+This is the solution we've devised, called `kafka-cp-deploy-manager`. It aims to address some of the limitations of the existing options outlined above.
+
+Managing Kafka clusters can be challenging, with tasks like version upgrades, configuration changes, and scaling to meet demand requiring careful coordination. `kafka-cp-deploy-manager` streamlines this process by managing the state of Kafka clusters and automating deployments to reach the desired configuration.
+
+We've built a pre-configured pipeline using Confluent’s Ansible-based deployment solution for Kafka, enhancing its usability. The tool also allows us to create deployment pipelines for other methods, like open-source Apache Kafka and Confluent for Kubernetes (CFK).
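+
+For a sense of what such a pipeline drives under the hood, the deploy step essentially boils down to checking out the desired state and applying it with Confluent's Ansible collection. The snippet below is only a rough sketch of that idea, not the actual kafka-cp-deploy-manager implementation; the repository URL placeholder, the branch name and the confluent_package_version variable are assumptions for illustration.
+
+```bash
+# Rough sketch (illustrative only): fetch the desired state, then apply it
+# with Confluent's cp-ansible collection against the inventory it contains.
+git clone --branch 7.5.3 "<artifacts_repo_url>" cp-state
+cd cp-state
+
+# The version variable name assumes cp-ansible conventions; adjust to your setup.
+ansible-playbook -i inventories/ansible-inventory.yml \
+  confluent.platform.all \
+  -e confluent_package_version=7.5.3
+```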
+ + +### **How It Works** + +`kafka-cp-deploy-manager` works alongside Jenkins for continuous deployment on any state changes. Jenkins abstracts away the deployment complexity by providing simplified configurable variables in an interactive UI. This integration ensures that any state changes are deployed consistently and reproducibly, mitigating human error. + + +### **Sample Configuration** + +To illustrate the effectiveness of `kafka-cp-deploy-manager`, let's look at a sample configuration. We can use a yaml file like this to set up custom pipelines. Here’s what you need to set up: + +``` +all: + vars: + ansible_connection: ssh + ansible_user: root + ansible_become: true + ansible_ssh_private_key_file: /path/to/private-key + git_repository_url: "artifacts_repo_url" + git_token: "token" + + jenkins: + hosts: + hostname: + vars: + pipeline: "pipeline_name" + environment: + JENKINS_HOME: /data/jenkins + + jenkins_agent: + hosts: + hostname: + vars: + collections: /path/to/confluent_files.jar +``` +- `artifacts_repo_url`: The repository containing the artifacts. + +- `token`: To access the Git repository. + +- `pipeline_name`: The pipeline that needs to be run. + +- `/path/to/confluent_files.jar`: Specifies where the necessary jar files are located. + +These configurations enable users to manage their Kafka clusters effectively with a simple, configurable interface without requiring an in-depth understanding of Kafka. + + +### **Example Scenario:** + +Let us see how easily we can upgrade the confluent platform using `kafka-cp-deploy-manager` + + +### **Upgrading Kafka with kafka-cp-deploy-manager** + +We have a Confluent Platform version 7.4.3 installed on an EC2 instance. Now, we will upgrade Kafka to version 7.5.3. + +1. Check the current version of Confluent Platform + + ![Image6](../assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1.png) + +2. **Access the Ansible Install Pipeline**: Navigate to the pipeline dashboard. + + Image10 + +3. **Add Required Parameters**: + + 1. **URL of Git Repository**: Provide the repository URL. + + Image10 + + 2. **Git Repo Access Token**: Input the access token. + + Image10 + + 3. **Branch**: Specify the branch for the new version. + + 4. **Confluent Platform Version**: Set to 7.5.3. + + Image10 + + 5. **Private Key of EC2 Instance**: Provide the private key for the instance. + + Image10 + +4. **Trigger the Build**: Click on 'Build' to run the pipeline. + + Image10 + +5. **Monitor the Console Output**: Watch the pipeline run in the console output. + + Image10 + +6. **Verify the Upgrade**: Check the Confluent Platform version in the EC2 instance to ensure it has been upgraded to 7.5.3.! + + Image10 + + +## Conclusion + +`kafka-cp-deploy-manager` is a powerful tool that helps users manage their Kafka clusters efficiently. With its consistent and reproducible deployment pipelines, minimized human error, smooth upgrades and rollbacks, and clear separation of privileges, it simplifies Kafka cluster management. Users can achieve effective Kafka cluster management with a simple, configurable interface, making the process accessible even without deep Kafka knowledge. + + +### **Key Benefits** + +1. **Consistent and Reproducible Deployment Pipelines**: Ensures that deployment pipelines are consistent, reducing the chances of errors and inconsistencies. + +2. **Mitigates Human Error**: By automating the deployment process, it minimizes human error, making the process more reliable. + +3. 
**Smooth Upgrades and Rollbacks**: Version changes to the state file are easily managed, ensuring smooth upgrades and rollbacks.
+
+4. **Isolation Between Write and Execute Privileges**: Admins can create or modify state files, while developers can trigger the deployment process, ensuring a clear separation of responsibilities.
diff --git a/_posts/platform-strategy-back.md b/_posts/platform-strategy-back.md
new file mode 100644
index 0000000000..a9d6fda3dd
--- /dev/null
+++ b/_posts/platform-strategy-back.md
@@ -0,0 +1,91 @@
+---
+layout: post
+title: "A guide to platform strategy"
+author: p6
+categories: [ Platform Engineering, Data, Infrastructure, Kafka, Kong, Kubernetes ]
+image: assets/blog-images/platform-strategy.svg
+featured: true
+hidden: true
+teaser: A technology leader's guide to operating like a digital native
+toc: true
+---
+
+# Prelude
+
+Since we started Platformatory in 2021, much of our work has been focused on niche themes surrounding "platform play" & engineering in enterprise Digital / IT organizations. We've come across several recurring themes around organizational, cultural and technological challenges that inhibit customers from optimizing their digital operating model. This thesis is a distillation of our consulting experience and, specifically, of the mental models that will enable technology leaders to adopt & execute a sound digital platform strategy.
+
+
+# Why is a Platform Strategy so important?
+
+IT has changed over the years: well into the mid-2000s, most IT orgs operated as services-centric organizations that administered servers, infrastructure and big-ticket enterprise ~~bloatware~~ software. In contrast, modern digital organizations operate a heterogeneous, multi-cloud stack comprising a plethora of complex applications -- a space which has exploded. DevOps is almost ubiquitous.
+
+Or is it really?
+
+Let's inspect the problem with a healthy amount of skepticism. Don't get us wrong: most organizations claim to be Agile-enabled (albeit embracing it with a liberal sprinkling of "two-speed IT" amongst other tropes); there have been multi-year transformation programs; org structures have changed and much makeover has happened. However, gaps remain at large: old wine in a new bottle.
+
+Let's begin by calling a few things out:
+
+1. Most teams are barely product oriented: Organizations still tend to look at "software systems" and hyperspecialized teams rather than true product orientation.
+2. Business-IT barrier: IT still owns governance functions - so DevOps teams are barely empowered in practice, and anything that hits IT is a case study in the theory of constraints.
+3. Dev(Sec)Ops is actually mostly ops -- much less Dev or Sec.
+4. Data is typically a large-scale analytics silo.
+
+
+We could go on and on. If you think these are strawmen, look deeply into your organizations and introspect. Even in somewhat mature Agile enterprises, we see these dysfunctions at some level or the other.
+
+It isn't that hyperspecialized organizations and silos are universally bad in themselves. In fact, every silo is actually a context-bound efficiency center. The problem is more that truly DevOps-oriented, "you build it, you run it" teams are too risky an operating model in the enterprise at scale. This realization leads to the slippery slope of half measures, resistance, badly designed team topologies and hampered decision-making at all levels, leading to the dysfunction we described previously.
+
+# How do you build a Platform-Product Strategy fit to purpose?
+ +We feel that the most important trend IT / Digital organizations need to subscribe to is to adopt platforms. + +Let's begin by defining Platforms: + +> A platform is a foundation for enterprise applications, that enables diverse product teams to do more while inventing fewer bespoke solutions and enabling at-scale operations and governance. + +> Platform Engineering is the collective set of practices & patterns associated with building and running platforms and opinionated by the lens of software engineering practices applied in the context of operations. + +Why platforms? primarily because in the business that you run, your efficiency is best directed at running the business, rather than being in the business of platform software itself. This can’t be stressed enough; We see digital platform teams go down the rabbit hole of attempting to be pure tech, sometimes out of aspiration and other times out of misguided hubris. Operating platforms is a way to increase efficiencies at getting stuff done, and decrease the operational toil. Platforms should be the foundation of the Digital IT operating model. + +Platform engineering may itself sound like a recombination of DevOps and SRE (this is true to some extent), but differs in it's coverage that practitioners of Platform engineering must command a deeper grasp of the software stack, including but not restricted to security, integration and data. + +So, by those definitions -- you can look at all cloud services (even infrastructure primitives) that are operated by engineering teams as constituting a platform, as long as it serves the needs of one or more product teams. Chances are however that primitive platforms leave a lot to be developed and for an operating model to be efficient, you need to run higher order platforms. + + +This is all easier said than done. Embracing platforms is not without problems. + +1. Enterprise software comes with nuances, feature based differentiation, hiring/skilling challenges and lock-in risks +2. Even cloud platforms come with lock-in risks + +We feel it makes a lot of sense for companies to walk not only the cloud native path – which is that everything in the longer run must be built in the cloud, for the cloud, but also adopt a multi-cloud posture so as to hedge your bets and preserve flexibility towards aligning contracts that meet the bottomline. + +We also have one noteworthy addition, and that is to adopt an open-source first policy. + +This is rooted in solid fundamentals: + +1. OSS rules the roost today. Server/OS market has been won long ago, and particularly since the advent of the cloud. +2. Cloud providers majorly embrace OSS - providing managed services for popular OSS projects +3. A whole class of PaaS software deploys to any cloud (run it yourself model) - supporting use-cases where conventional SaaS cannot be supported +4. Finally, OSS foundations (CNCF, Apache Foundation, amongst others) are providing ecosystems where software development thrives, and are well funded typically by big tech that are invested in these technologies. + +As a digital/tech/IT leader, if you've sought active risk management against vendor lock in and navigating the potential consequences of choosing the wrong tech (particularly proprietary), you will find long term risk management success in the principled approach that we propose. + +We will summarize this as follows: + +1. Choose the best OSS technology catering to your problems and evaluate for project activity, stability and quality of APIs / abstractions +2. 
+2. Investigate whether a (cloud-agnostic) PaaS vendor offers it and can support multi-cloud deployments.
+3. As the next resort, does your preferred public cloud vendor provide a managed service?
+4. As a final resort, does your second-preference public cloud vendor provide a managed service?
+5. Can you run it yourself (combined with tooling, particularly K8s operators)? Read further.
+
+In doing so, you will end up choosing best-in-breed (or prospectively best-in-breed) platforms, such that even in the worst-case scenario you will have portability and bargaining power with no significant replatforming or migration cost. This is a winning strategy.
+
+It is important to think deeply about what it means to run platform(s) at central IT scale. There are a few main points here:
+
+1. Almost no platform is turnkey. You will inevitably need some extensibility, and the platform should provide the hooks for it.
+2. Ideally, the platform should offer a plugin ecosystem that is community supported.
+3. Digital platform IT runs for the sake of product teams: platforms should offer configurable multi-tenancy primitives out of the box, so that platform teams can provide tooling (e.g. Infrastructure as Code, CI/CD pipelines for DevOps automation, starter kits), security, governance and other cross-cutting concerns, while relinquishing control so that product teams can craft their own solutions with full flexibility within the bounds of their tenancy (a minimal sketch below shows what such primitives can look like).
+4. Based on DevOps/SRE maturity, enable flexible operations with a shared-responsibility model (such as delineating operations of the platform from operations in the platform) that gels well with the culture of the organization.
+
+Even if we consider central IT structures inefficient in the longer run, centralization invariably creates short-term efficiency centers for shared services, at least at the BU level.
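+
+To make the multi-tenancy point above concrete, here is a minimal, hypothetical sketch of what "tenancy primitives stamped out by the platform team" can look like on Kubernetes: one namespace per product team, plus a ResourceQuota and a default-deny NetworkPolicy provisioned centrally (via IaC or a starter kit), with everything inside the namespace left to the product team. The team name, label key and limits are illustrative assumptions, not a prescription.
+
+```yaml
+# Hypothetical tenant scaffold the platform team might template per product team.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: team-payments                # assumed convention: one namespace per product team
+  labels:
+    platform.example.io/tenant: payments
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: payments-quota
+  namespace: team-payments
+spec:
+  hard:
+    requests.cpu: "8"                # caps agreed with the tenant; values are illustrative
+    requests.memory: 16Gi
+    pods: "50"
+---
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: default-deny-ingress
+  namespace: team-payments
+spec:
+  podSelector: {}                    # applies to every pod in the tenant namespace
+  policyTypes:
+    - Ingress                        # deny all ingress by default; teams open up only what they need
+```
+
+The same idea extends beyond Kubernetes: per-team Kafka quotas and prefixed ACLs, or per-tenant API gateway workspaces, can be templated by the platform team in much the same way, keeping governance central while leaving solution design to the product teams.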
+ diff --git a/assets/blog-images/50-shades.jpg b/assets/blog-images/50-shades.jpg new file mode 100644 index 0000000000..e048866388 Binary files /dev/null and b/assets/blog-images/50-shades.jpg differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/banner.jpg b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/banner.jpg new file mode 100644 index 0000000000..2726b0c7c1 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/banner.jpg differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1.png new file mode 100644 index 0000000000..b064c8a4b3 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1_1.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1_1.png new file mode 100644 index 0000000000..ff84035804 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image1_1.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image2.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image2.png new file mode 100644 index 0000000000..55570fc116 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image2.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image3.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image3.png new file mode 100644 index 0000000000..df94b425f8 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image3.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image4.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image4.png new file mode 100644 index 0000000000..766813f96f Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image4.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image6.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image6.png new file mode 100644 index 0000000000..85b49844c5 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image6.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image7.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image7.png new file mode 100644 index 0000000000..a3fef0f5c9 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image7.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8.png new file mode 100644 index 0000000000..9b479a7b69 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8.png differ diff --git a/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8_1.png b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8_1.png new file mode 100644 index 0000000000..08215baf50 Binary files /dev/null and b/assets/blog-images/Automating-Kafka-Deployments-with-CICD/image8_1.png differ diff --git a/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF1.png b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF1.png new file mode 100644 index 0000000000..c393398014 Binary files /dev/null and 
b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF1.png differ diff --git a/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF2.png b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF2.png new file mode 100644 index 0000000000..245f18cb37 Binary files /dev/null and b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/BLOG-GF2.png differ diff --git a/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/title.webp b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/title.webp new file mode 100644 index 0000000000..5e2174a770 Binary files /dev/null and b/assets/blog-images/Learning-by-Debugging-Kafka-Local-Setup/title.webp differ diff --git a/assets/blog-images/Unlocking-the-Potential-of-Local-Kafka/Blog_image.png b/assets/blog-images/Unlocking-the-Potential-of-Local-Kafka/Blog_image.png new file mode 100644 index 0000000000..4263a8dede Binary files /dev/null and b/assets/blog-images/Unlocking-the-Potential-of-Local-Kafka/Blog_image.png differ diff --git a/assets/blog-images/apinomy_blog/ApinomyBlog1.png b/assets/blog-images/apinomy_blog/ApinomyBlog1.png new file mode 100644 index 0000000000..f68009bef4 Binary files /dev/null and b/assets/blog-images/apinomy_blog/ApinomyBlog1.png differ diff --git a/assets/blog-images/apinomy_blog/ApinomyBlog2.png b/assets/blog-images/apinomy_blog/ApinomyBlog2.png new file mode 100644 index 0000000000..9b1faec2af Binary files /dev/null and b/assets/blog-images/apinomy_blog/ApinomyBlog2.png differ diff --git a/assets/blog-images/apinomy_blog/ApinomyBlogTitle.webp b/assets/blog-images/apinomy_blog/ApinomyBlogTitle.webp new file mode 100644 index 0000000000..3ba9163ed0 Binary files /dev/null and b/assets/blog-images/apinomy_blog/ApinomyBlogTitle.webp differ diff --git a/assets/blog-images/auth0-integration_blog/Accounts-and-access.png b/assets/blog-images/auth0-integration_blog/Accounts-and-access.png new file mode 100644 index 0000000000..62d2291e3f Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Accounts-and-access.png differ diff --git a/assets/blog-images/auth0-integration_blog/Application-settings.png b/assets/blog-images/auth0-integration_blog/Application-settings.png new file mode 100644 index 0000000000..c4a7659a86 Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Application-settings.png differ diff --git a/assets/blog-images/auth0-integration_blog/Application-type.png b/assets/blog-images/auth0-integration_blog/Application-type.png new file mode 100644 index 0000000000..3fe782c6c3 Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Application-type.png differ diff --git a/assets/blog-images/auth0-integration_blog/Athentication vs Athorization.png b/assets/blog-images/auth0-integration_blog/Athentication vs Athorization.png new file mode 100644 index 0000000000..492fea511a Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Athentication vs Athorization.png differ diff --git a/assets/blog-images/auth0-integration_blog/Athentication-vs-Athorization.png b/assets/blog-images/auth0-integration_blog/Athentication-vs-Athorization.png new file mode 100644 index 0000000000..492fea511a Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Athentication-vs-Athorization.png differ diff --git a/assets/blog-images/auth0-integration_blog/Auth0-Idp.png b/assets/blog-images/auth0-integration_blog/Auth0-Idp.png new file mode 100644 index 0000000000..7e74df6c21 Binary files /dev/null and 
b/assets/blog-images/auth0-integration_blog/Auth0-Idp.png differ diff --git a/assets/blog-images/auth0-integration_blog/Create-Application.png b/assets/blog-images/auth0-integration_blog/Create-Application.png new file mode 100644 index 0000000000..62e0cbf18c Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Create-Application.png differ diff --git a/assets/blog-images/auth0-integration_blog/Gettting-started.png b/assets/blog-images/auth0-integration_blog/Gettting-started.png new file mode 100644 index 0000000000..0461ccb5fb Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Gettting-started.png differ diff --git a/assets/blog-images/auth0-integration_blog/Identity-pool.png b/assets/blog-images/auth0-integration_blog/Identity-pool.png new file mode 100644 index 0000000000..da0d1421bd Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Identity-pool.png differ diff --git a/assets/blog-images/auth0-integration_blog/Identity-pool2.png b/assets/blog-images/auth0-integration_blog/Identity-pool2.png new file mode 100644 index 0000000000..74184c5f6e Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Identity-pool2.png differ diff --git a/assets/blog-images/auth0-integration_blog/Identity-provider.png b/assets/blog-images/auth0-integration_blog/Identity-provider.png new file mode 100644 index 0000000000..bbcabb0e68 Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Identity-provider.png differ diff --git a/assets/blog-images/auth0-integration_blog/Identity-provider2.png b/assets/blog-images/auth0-integration_blog/Identity-provider2.png new file mode 100644 index 0000000000..2c50c9427f Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Identity-provider2.png differ diff --git a/assets/blog-images/auth0-integration_blog/Web_Photo_Editor.jpg b/assets/blog-images/auth0-integration_blog/Web_Photo_Editor.jpg new file mode 100644 index 0000000000..4b1ab4c537 Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/Web_Photo_Editor.jpg differ diff --git a/assets/blog-images/auth0-integration_blog/email-main-template_auth0-by-okta-logo_black_279x127_3x.png b/assets/blog-images/auth0-integration_blog/email-main-template_auth0-by-okta-logo_black_279x127_3x.png new file mode 100644 index 0000000000..2f5e538c7b Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/email-main-template_auth0-by-okta-logo_black_279x127_3x.png differ diff --git a/assets/blog-images/auth0-integration_blog/openid-configuration.png b/assets/blog-images/auth0-integration_blog/openid-configuration.png new file mode 100644 index 0000000000..ae268d3bf3 Binary files /dev/null and b/assets/blog-images/auth0-integration_blog/openid-configuration.png differ diff --git a/assets/blog-images/batch_processing_blog/KafkaBatchProcessing.png b/assets/blog-images/batch_processing_blog/KafkaBatchProcessing.png new file mode 100644 index 0000000000..d31c18f837 Binary files /dev/null and b/assets/blog-images/batch_processing_blog/KafkaBatchProcessing.png differ diff --git a/assets/blog-images/batch_processing_blog/airflow_blog_1.png b/assets/blog-images/batch_processing_blog/airflow_blog_1.png new file mode 100644 index 0000000000..3506850f33 Binary files /dev/null and b/assets/blog-images/batch_processing_blog/airflow_blog_1.png differ diff --git a/assets/blog-images/batch_processing_blog/airflow_blog_2.png b/assets/blog-images/batch_processing_blog/airflow_blog_2.png new file mode 100644 index 
0000000000..73700ee961 Binary files /dev/null and b/assets/blog-images/batch_processing_blog/airflow_blog_2.png differ diff --git a/assets/blog-images/batch_processing_blog/airflow_blog_3.png b/assets/blog-images/batch_processing_blog/airflow_blog_3.png new file mode 100644 index 0000000000..c1b39d07c8 Binary files /dev/null and b/assets/blog-images/batch_processing_blog/airflow_blog_3.png differ diff --git a/assets/blog-images/batch_processing_blog/airflow_blog_4.png b/assets/blog-images/batch_processing_blog/airflow_blog_4.png new file mode 100644 index 0000000000..d699acb28d Binary files /dev/null and b/assets/blog-images/batch_processing_blog/airflow_blog_4.png differ diff --git a/assets/blog-images/command_processing_blog/Dispatcher_Architecture.png b/assets/blog-images/command_processing_blog/Dispatcher_Architecture.png new file mode 100644 index 0000000000..d1d6d59e63 Binary files /dev/null and b/assets/blog-images/command_processing_blog/Dispatcher_Architecture.png differ diff --git a/assets/blog-images/command_processing_blog/EmailDAG.png b/assets/blog-images/command_processing_blog/EmailDAG.png new file mode 100644 index 0000000000..5462d596a4 Binary files /dev/null and b/assets/blog-images/command_processing_blog/EmailDAG.png differ diff --git a/assets/blog-images/command_processing_blog/EventDrivenArchitecture.png b/assets/blog-images/command_processing_blog/EventDrivenArchitecture.png new file mode 100644 index 0000000000..e50e893258 Binary files /dev/null and b/assets/blog-images/command_processing_blog/EventDrivenArchitecture.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/CreateKSQLinCC.png b/assets/blog-images/e2e_ksql_connect_blog/CreateKSQLinCC.png new file mode 100644 index 0000000000..b9437d2417 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/CreateKSQLinCC.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/CreateKafkaApiKey.png b/assets/blog-images/e2e_ksql_connect_blog/CreateKafkaApiKey.png new file mode 100644 index 0000000000..25013088bf Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/CreateKafkaApiKey.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/InputTopicMessagesCrop.png b/assets/blog-images/e2e_ksql_connect_blog/InputTopicMessagesCrop.png new file mode 100644 index 0000000000..196c035abe Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/InputTopicMessagesCrop.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/KSQLdbCLI.png b/assets/blog-images/e2e_ksql_connect_blog/KSQLdbCLI.png new file mode 100644 index 0000000000..fabea4f345 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/KSQLdbCLI.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/OutputTopicMessagesCrop.png b/assets/blog-images/e2e_ksql_connect_blog/OutputTopicMessagesCrop.png new file mode 100644 index 0000000000..ff5cb90252 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/OutputTopicMessagesCrop.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/PostgresTableOutputCrop.png b/assets/blog-images/e2e_ksql_connect_blog/PostgresTableOutputCrop.png new file mode 100644 index 0000000000..1acf2c5263 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/PostgresTableOutputCrop.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/StreamDesignerPipeline.png b/assets/blog-images/e2e_ksql_connect_blog/StreamDesignerPipeline.png new file mode 100644 index 0000000000..d6b9d645db Binary files 
/dev/null and b/assets/blog-images/e2e_ksql_connect_blog/StreamDesignerPipeline.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/end-to-end-pipeline.jpg b/assets/blog-images/e2e_ksql_connect_blog/end-to-end-pipeline.jpg new file mode 100644 index 0000000000..331fa756cb Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/end-to-end-pipeline.jpg differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/faust_blog_1.png b/assets/blog-images/e2e_ksql_connect_blog/faust_blog_1.png new file mode 100644 index 0000000000..7fb7fddc03 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/faust_blog_1.png differ diff --git a/assets/blog-images/e2e_ksql_connect_blog/streamsgovernanceimage.png b/assets/blog-images/e2e_ksql_connect_blog/streamsgovernanceimage.png new file mode 100644 index 0000000000..c4df991134 Binary files /dev/null and b/assets/blog-images/e2e_ksql_connect_blog/streamsgovernanceimage.png differ diff --git a/assets/blog-images/email-main-template_auth0-by-okta-logo_black_279x127_3x.png b/assets/blog-images/email-main-template_auth0-by-okta-logo_black_279x127_3x.png new file mode 100644 index 0000000000..2f5e538c7b Binary files /dev/null and b/assets/blog-images/email-main-template_auth0-by-okta-logo_black_279x127_3x.png differ diff --git a/assets/blog-images/faust_101_blog/FaustLogo.png b/assets/blog-images/faust_101_blog/FaustLogo.png new file mode 100644 index 0000000000..7b8f35be4a Binary files /dev/null and b/assets/blog-images/faust_101_blog/FaustLogo.png differ diff --git a/assets/blog-images/faust_101_blog/faust_blog_1.png b/assets/blog-images/faust_101_blog/faust_blog_1.png new file mode 100644 index 0000000000..7fb7fddc03 Binary files /dev/null and b/assets/blog-images/faust_101_blog/faust_blog_1.png differ diff --git a/assets/blog-images/faust_101_blog/faust_blog_2.png b/assets/blog-images/faust_101_blog/faust_blog_2.png new file mode 100644 index 0000000000..3d36874792 Binary files /dev/null and b/assets/blog-images/faust_101_blog/faust_blog_2.png differ diff --git a/assets/blog-images/flink_deployment/flink-application.png b/assets/blog-images/flink_deployment/flink-application.png new file mode 100644 index 0000000000..70ce858a89 Binary files /dev/null and b/assets/blog-images/flink_deployment/flink-application.png differ diff --git a/assets/blog-images/flink_deployment/flink_architecture.jpg b/assets/blog-images/flink_deployment/flink_architecture.jpg new file mode 100644 index 0000000000..dd6ec14689 Binary files /dev/null and b/assets/blog-images/flink_deployment/flink_architecture.jpg differ diff --git a/assets/blog-images/kafka-dystopian.png b/assets/blog-images/kafka-dystopian.png new file mode 100644 index 0000000000..0155985244 Binary files /dev/null and b/assets/blog-images/kafka-dystopian.png differ diff --git a/assets/blog-images/kafka-leaderless-future/kafka-dystopian.webp b/assets/blog-images/kafka-leaderless-future/kafka-dystopian.webp new file mode 100644 index 0000000000..0155985244 Binary files /dev/null and b/assets/blog-images/kafka-leaderless-future/kafka-dystopian.webp differ diff --git a/assets/blog-images/kafka-perf-suite/consumer-dashboard-01.png b/assets/blog-images/kafka-perf-suite/consumer-dashboard-01.png new file mode 100644 index 0000000000..4c6c081b5e Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/consumer-dashboard-01.png differ diff --git a/assets/blog-images/kafka-perf-suite/consumer-dashboard-02.png b/assets/blog-images/kafka-perf-suite/consumer-dashboard-02.png new 
file mode 100644 index 0000000000..0397fa5a9e Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/consumer-dashboard-02.png differ diff --git a/assets/blog-images/kafka-perf-suite/consumer-dashboard-03.png b/assets/blog-images/kafka-perf-suite/consumer-dashboard-03.png new file mode 100644 index 0000000000..49d4f7c0ce Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/consumer-dashboard-03.png differ diff --git a/assets/blog-images/kafka-perf-suite/kafka-benchmark-metrics-featured.png b/assets/blog-images/kafka-perf-suite/kafka-benchmark-metrics-featured.png new file mode 100644 index 0000000000..be94b57a7f Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/kafka-benchmark-metrics-featured.png differ diff --git a/assets/blog-images/kafka-perf-suite/producer-dashboard-01.png b/assets/blog-images/kafka-perf-suite/producer-dashboard-01.png new file mode 100644 index 0000000000..6ec4676df8 Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/producer-dashboard-01.png differ diff --git a/assets/blog-images/kafka-perf-suite/producer-dashboard-02.png b/assets/blog-images/kafka-perf-suite/producer-dashboard-02.png new file mode 100644 index 0000000000..fdd616f73a Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/producer-dashboard-02.png differ diff --git a/assets/blog-images/kafka-perf-suite/producer-dashboard-03.png b/assets/blog-images/kafka-perf-suite/producer-dashboard-03.png new file mode 100644 index 0000000000..e9630a5ad2 Binary files /dev/null and b/assets/blog-images/kafka-perf-suite/producer-dashboard-03.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ClientUsage1.png b/assets/blog-images/kafka_chargeback_blog/ClientUsage1.png new file mode 100644 index 0000000000..9e794b0872 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ClientUsage1.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ClientUsage2.png b/assets/blog-images/kafka_chargeback_blog/ClientUsage2.png new file mode 100644 index 0000000000..d890c1fc89 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ClientUsage2.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ClientUsage3.png b/assets/blog-images/kafka_chargeback_blog/ClientUsage3.png new file mode 100644 index 0000000000..2c2d87bb26 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ClientUsage3.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/CostCentre2.png b/assets/blog-images/kafka_chargeback_blog/CostCentre2.png new file mode 100644 index 0000000000..7dd83ec1e8 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/CostCentre2.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/Cost_Centre1.png b/assets/blog-images/kafka_chargeback_blog/Cost_Centre1.png new file mode 100644 index 0000000000..913e090d79 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/Cost_Centre1.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ResourceUsage1.png b/assets/blog-images/kafka_chargeback_blog/ResourceUsage1.png new file mode 100644 index 0000000000..cae056c265 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ResourceUsage1.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ResourceUsage2.png b/assets/blog-images/kafka_chargeback_blog/ResourceUsage2.png new file mode 100644 index 0000000000..2ab3a75670 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ResourceUsage2.png differ diff 
--git a/assets/blog-images/kafka_chargeback_blog/ResourceUsage3.png b/assets/blog-images/kafka_chargeback_blog/ResourceUsage3.png new file mode 100644 index 0000000000..b7095b14c1 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ResourceUsage3.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ResourceUsage4.png b/assets/blog-images/kafka_chargeback_blog/ResourceUsage4.png new file mode 100644 index 0000000000..61d6c4cdf7 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ResourceUsage4.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/ResourceUsage5.png b/assets/blog-images/kafka_chargeback_blog/ResourceUsage5.png new file mode 100644 index 0000000000..2b406d9fc0 Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/ResourceUsage5.png differ diff --git a/assets/blog-images/kafka_chargeback_blog/TitleImage.jpg b/assets/blog-images/kafka_chargeback_blog/TitleImage.jpg new file mode 100644 index 0000000000..5d5db0daca Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/TitleImage.jpg differ diff --git a/assets/blog-images/kafka_chargeback_blog/kafka-Total-Cost.png b/assets/blog-images/kafka_chargeback_blog/kafka-Total-Cost.png new file mode 100644 index 0000000000..1f82aa897d Binary files /dev/null and b/assets/blog-images/kafka_chargeback_blog/kafka-Total-Cost.png differ diff --git a/assets/blog-images/kafka_service_locator/kafka_service_locator_1.png b/assets/blog-images/kafka_service_locator/kafka_service_locator_1.png new file mode 100644 index 0000000000..250b611e30 Binary files /dev/null and b/assets/blog-images/kafka_service_locator/kafka_service_locator_1.png differ diff --git a/assets/blog-images/librdkafka_interceptors/AuditLibrdkafkaInterceptors.png b/assets/blog-images/librdkafka_interceptors/AuditLibrdkafkaInterceptors.png new file mode 100644 index 0000000000..1767137e5b Binary files /dev/null and b/assets/blog-images/librdkafka_interceptors/AuditLibrdkafkaInterceptors.png differ diff --git a/assets/blog-images/librdkafka_interceptors/KafkaInterceptor.png b/assets/blog-images/librdkafka_interceptors/KafkaInterceptor.png new file mode 100644 index 0000000000..accdef4fdf Binary files /dev/null and b/assets/blog-images/librdkafka_interceptors/KafkaInterceptor.png differ diff --git a/assets/blog-images/librdkafka_interceptors/KafkaInterceptors.png b/assets/blog-images/librdkafka_interceptors/KafkaInterceptors.png new file mode 100644 index 0000000000..e01357905f Binary files /dev/null and b/assets/blog-images/librdkafka_interceptors/KafkaInterceptors.png differ diff --git a/assets/blog-images/mainfesto.jpg b/assets/blog-images/mainfesto.jpg new file mode 100644 index 0000000000..ddbcfce67d Binary files /dev/null and b/assets/blog-images/mainfesto.jpg differ diff --git a/assets/blog-images/migrations/0_TB4m7ZfPMgHF_ljN.png b/assets/blog-images/migrations/0_TB4m7ZfPMgHF_ljN.png new file mode 100644 index 0000000000..2468979155 Binary files /dev/null and b/assets/blog-images/migrations/0_TB4m7ZfPMgHF_ljN.png differ diff --git a/assets/blog-images/migrations/Image2.png b/assets/blog-images/migrations/Image2.png new file mode 100644 index 0000000000..f05f9a1485 Binary files /dev/null and b/assets/blog-images/migrations/Image2.png differ diff --git a/assets/blog-images/migrations/image1.png b/assets/blog-images/migrations/image1.png new file mode 100644 index 0000000000..e6a75ea11a Binary files /dev/null and b/assets/blog-images/migrations/image1.png differ diff --git 
a/assets/blog-images/oauth-oidc-blog/ClusterAdmin.png b/assets/blog-images/oauth-oidc-blog/ClusterAdmin.png new file mode 100644 index 0000000000..bcc282dd5c Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/ClusterAdmin.png differ diff --git a/assets/blog-images/oauth-oidc-blog/Oauth.jpeg b/assets/blog-images/oauth-oidc-blog/Oauth.jpeg new file mode 100644 index 0000000000..b948f63732 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/Oauth.jpeg differ diff --git a/assets/blog-images/oauth-oidc-blog/Operatorrole.png b/assets/blog-images/oauth-oidc-blog/Operatorrole.png new file mode 100644 index 0000000000..9593ae4d77 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/Operatorrole.png differ diff --git a/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-11-33.png b/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-11-33.png new file mode 100644 index 0000000000..e827292e09 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-11-33.png differ diff --git a/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-18-50.png b/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-18-50.png new file mode 100644 index 0000000000..694d1cd86d Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/Screenshot from 2024-07-24 16-18-50.png differ diff --git a/assets/blog-images/oauth-oidc-blog/application.png b/assets/blog-images/oauth-oidc-blog/application.png new file mode 100644 index 0000000000..5c28b2c857 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/application.png differ diff --git a/assets/blog-images/oauth-oidc-blog/authorization.png b/assets/blog-images/oauth-oidc-blog/authorization.png new file mode 100644 index 0000000000..a0bb511aba Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/authorization.png differ diff --git a/assets/blog-images/oauth-oidc-blog/cctoken.png b/assets/blog-images/oauth-oidc-blog/cctoken.png new file mode 100644 index 0000000000..91671eeb89 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/cctoken.png differ diff --git a/assets/blog-images/oauth-oidc-blog/configurerole.png b/assets/blog-images/oauth-oidc-blog/configurerole.png new file mode 100644 index 0000000000..358b11673f Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/configurerole.png differ diff --git a/assets/blog-images/oauth-oidc-blog/idp.png b/assets/blog-images/oauth-oidc-blog/idp.png new file mode 100644 index 0000000000..4cd339789a Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/idp.png differ diff --git a/assets/blog-images/oauth-oidc-blog/idp2.png b/assets/blog-images/oauth-oidc-blog/idp2.png new file mode 100644 index 0000000000..7c4066632b Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/idp2.png differ diff --git a/assets/blog-images/oauth-oidc-blog/images.txt b/assets/blog-images/oauth-oidc-blog/images.txt new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/assets/blog-images/oauth-oidc-blog/images.txt @@ -0,0 +1 @@ + diff --git a/assets/blog-images/oauth-oidc-blog/oauth.png b/assets/blog-images/oauth-oidc-blog/oauth.png new file mode 100644 index 0000000000..ae4fb4703e Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/oauth.png differ diff --git a/assets/blog-images/oauth-oidc-blog/okta.png b/assets/blog-images/oauth-oidc-blog/okta.png new file mode 100644 index 0000000000..98b0a10278 Binary files /dev/null and 
b/assets/blog-images/oauth-oidc-blog/okta.png differ diff --git a/assets/blog-images/oauth-oidc-blog/openid.jpg b/assets/blog-images/oauth-oidc-blog/openid.jpg new file mode 100644 index 0000000000..02f05dca49 Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/openid.jpg differ diff --git a/assets/blog-images/oauth-oidc-blog/pool.png b/assets/blog-images/oauth-oidc-blog/pool.png new file mode 100644 index 0000000000..5d7bed9dff Binary files /dev/null and b/assets/blog-images/oauth-oidc-blog/pool.png differ diff --git a/assets/blog-images/platform-arch-model.svg b/assets/blog-images/platform-arch-model.svg new file mode 100644 index 0000000000..26b5a5d269 --- /dev/null +++ b/assets/blog-images/platform-arch-model.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/blog-images/platform-strategy.svg b/assets/blog-images/platform-strategy.svg new file mode 100644 index 0000000000..1fbffed574 --- /dev/null +++ b/assets/blog-images/platform-strategy.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/assets/blog-images/real_time_retail/RealTimeRetail1.png b/assets/blog-images/real_time_retail/RealTimeRetail1.png new file mode 100644 index 0000000000..29b4f6d0f8 Binary files /dev/null and b/assets/blog-images/real_time_retail/RealTimeRetail1.png differ diff --git a/assets/blog-images/real_time_retail/RealTimeRetail2.png b/assets/blog-images/real_time_retail/RealTimeRetail2.png new file mode 100644 index 0000000000..b0275f6f75 Binary files /dev/null and b/assets/blog-images/real_time_retail/RealTimeRetail2.png differ diff --git a/assets/blog-images/real_time_retail/RealTimeRetail3.png b/assets/blog-images/real_time_retail/RealTimeRetail3.png new file mode 100644 index 0000000000..64c3c5fdc2 Binary files /dev/null and b/assets/blog-images/real_time_retail/RealTimeRetail3.png differ diff --git a/assets/blog-images/real_time_retail/RealTimeRetailTitle.jpg b/assets/blog-images/real_time_retail/RealTimeRetailTitle.jpg new file mode 100644 index 0000000000..05812a13ab Binary files /dev/null and b/assets/blog-images/real_time_retail/RealTimeRetailTitle.jpg differ diff --git a/assets/blog-images/running-kafka-at-scale/architecture.png b/assets/blog-images/running-kafka-at-scale/architecture.png new file mode 100644 index 0000000000..3ed8b5bfe4 Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/architecture.png differ diff --git a/assets/blog-images/running-kafka-at-scale/deploymentmanager.png b/assets/blog-images/running-kafka-at-scale/deploymentmanager.png new file mode 100644 index 0000000000..4566fa18dd Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/deploymentmanager.png differ diff --git a/assets/blog-images/running-kafka-at-scale/graph.png b/assets/blog-images/running-kafka-at-scale/graph.png new file mode 100644 index 0000000000..0d4346c0eb Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/graph.png differ diff --git a/assets/blog-images/running-kafka-at-scale/jmxmonitor.png b/assets/blog-images/running-kafka-at-scale/jmxmonitor.png new file mode 100644 index 0000000000..c8520c6df6 Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/jmxmonitor.png differ diff --git a/assets/blog-images/running-kafka-at-scale/kafka.png b/assets/blog-images/running-kafka-at-scale/kafka.png new file mode 100644 index 0000000000..e43acb96c3 Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/kafka.png differ diff --git a/assets/blog-images/running-kafka-at-scale/logmonitor.png 
b/assets/blog-images/running-kafka-at-scale/logmonitor.png new file mode 100644 index 0000000000..dceff00cc6 Binary files /dev/null and b/assets/blog-images/running-kafka-at-scale/logmonitor.png differ diff --git a/assets/blog-images/streaming_databases/FunctionsToSQL.png b/assets/blog-images/streaming_databases/FunctionsToSQL.png new file mode 100644 index 0000000000..0d0acc3023 Binary files /dev/null and b/assets/blog-images/streaming_databases/FunctionsToSQL.png differ diff --git a/assets/blog-images/streaming_databases/StreamingDatabase.png b/assets/blog-images/streaming_databases/StreamingDatabase.png new file mode 100644 index 0000000000..69b959680d Binary files /dev/null and b/assets/blog-images/streaming_databases/StreamingDatabase.png differ diff --git a/assets/blog-images/streaming_databases/real_time_spectrum.png b/assets/blog-images/streaming_databases/real_time_spectrum.png new file mode 100644 index 0000000000..55279339d8 Binary files /dev/null and b/assets/blog-images/streaming_databases/real_time_spectrum.png differ diff --git a/assets/blog-images/streaming_databases/streaming_dbs_comparison.png b/assets/blog-images/streaming_databases/streaming_dbs_comparison.png new file mode 100644 index 0000000000..869f3322da Binary files /dev/null and b/assets/blog-images/streaming_databases/streaming_dbs_comparison.png differ diff --git a/assets/blog-images/unified_access_layer/managing-clusters-problem.png b/assets/blog-images/unified_access_layer/managing-clusters-problem.png new file mode 100644 index 0000000000..8c7bcf56c6 Binary files /dev/null and b/assets/blog-images/unified_access_layer/managing-clusters-problem.png differ diff --git a/assets/blog-images/unified_access_layer/rule_based_routing_layer.png b/assets/blog-images/unified_access_layer/rule_based_routing_layer.png new file mode 100644 index 0000000000..250b611e30 Binary files /dev/null and b/assets/blog-images/unified_access_layer/rule_based_routing_layer.png differ diff --git a/assets/blog-images/unified_access_layer/unified_access_layer_1.png b/assets/blog-images/unified_access_layer/unified_access_layer_1.png new file mode 100644 index 0000000000..da5ba6cb42 Binary files /dev/null and b/assets/blog-images/unified_access_layer/unified_access_layer_1.png differ diff --git a/assets/blog-images/wget-log b/assets/blog-images/wget-log new file mode 100644 index 0000000000..ef0cc5902b --- /dev/null +++ b/assets/blog-images/wget-log @@ -0,0 +1,11 @@ +--2023-07-15 17:13:25-- https://pbs.twimg.com/media/ExtabmFXAAMaf0s?format=jpg +Resolving pbs.twimg.com (pbs.twimg.com)... 192.229.237.101 +Connecting to pbs.twimg.com (pbs.twimg.com)|192.229.237.101|:443... connected. +HTTP request sent, awaiting response... 
200 OK +Length: 83515 (82K) [image/jpeg] +Saving to: ‘ExtabmFXAAMaf0s?format=jpg’ + + ExtabmFXAAMaf0s?format=jpg 0%[ ] 0 --.-KB/s ExtabmFXAAMaf0s?format=jpg 100%[========================================================================================================================================>] 81.56K --.-KB/s in 0.04s + +2023-07-15 17:13:26 (2.05 MB/s) - ‘ExtabmFXAAMaf0s?format=jpg’ saved [83515/83515] + diff --git a/assets/css/screen.css b/assets/css/screen.css index b306d2cf27..726a6100c9 100644 --- a/assets/css/screen.css +++ b/assets/css/screen.css @@ -7,19 +7,45 @@ License: https://www.wowthemes.net/freebies-license/ html { font-size:18px; } /* Increase the font size on higher resolutions */ .container {max-width:80%;} } -.mainheading { - padding: 1rem 0rem; +.mainheading, .mainheading:hover { + color: inherit; + text-decoration: none; } -a { - color: #00ab6b; +li a { + color: #111111; transition: all 0.2s; + text-decoration: none; +} +.lead h3{ + font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif; } -a:hover { +li a:hover { color: #038252; + /* text-decoration: underline; */ +} +.wrapfooter{ + display: flex; + +} +.wrapfooter .read-btn{ + min-width: 132px; + margin-left: auto; + color: white; + font-size: 1.8em; + font-weight: 500; + background-color: #00ab6b; + padding: 0.4em 0.8em; + /* border-radius: 4px; */ +} +.wrapfooter .read-btn:hover{ + background-color: #039d62; text-decoration: none; } +.wrapfooter .read-btn:hover span{ + margin-left: 5px; +} pre { -moz-box-sizing: border-box; @@ -50,11 +76,17 @@ pre { .site-content { min-height: 60vh; - padding-top: 1.5rem; + padding-top: 3.5rem; margin-top: 57px; transition: all 0.4s; } +@media (max-width:768px) { + .site-content{ + padding-top: 7rem !important; + } +} + section { margin-bottom: 20px; } @@ -105,16 +137,32 @@ section.recent-posts { } .listfeaturedtag { + border: 1px solid rgba(0, 0, 0, .125); border-radius: .25rem; transition: all 0.3s cubic-bezier(.25, .8, .25, 1); } +.listfeaturedtag:hover .card-text, .card:hover .card-text{ + color: white !important; +} +.listfeaturedtag:hover, .post-box .card:hover { + color: white; + background-color: rgb(0, 0, 0); + /* border: 2px solid rgba(0, 0, 0, 0.181) !important ; */ + box-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1); +} + .listfeaturedtag .wrapthumbnail { height: 290px; flex: 0 0 auto; height: 100%; } +.card-body{ + display: flex; + flex-direction: column; + justify-content: center; +} .maxthumb { max-height: 300px; @@ -122,12 +170,13 @@ section.recent-posts { } .listfeaturedtag .card, -.card-footer { + .card-footer, .post-box .card-footer .wrapfooter { border: 0; -} + background-color: inherit; +} .listfeaturedtag .thumbnail { - background-size: cover; + background-size: cover; height: 100%; display: block; background-position: 38% 22% !important; @@ -170,12 +219,19 @@ section.recent-posts { } .wrapfooter { - font-size: .8rem; + font-size: .7rem; display: flex; align-items: center; margin-bottom: 15px; } +.author-thumb-top{ + width: 35px; + height: 35px; + margin:0 13px; + border-radius: 100%; +} + .author-thumb { width: 40px; height: 40px; @@ -184,7 +240,7 @@ section.recent-posts { } .post-top-meta { - margin-bottom: 2rem; + margin-bottom: 1rem; } .post-top-meta .author-thumb { @@ -208,9 +264,34 @@ section.recent-posts { font-size: 0.95rem; } -.toc ul { - list-style: decimal; - font-weight: 400; +.toc-container{ + padding: 0 12px; + border: 2px solid #bcbcbc; + transition: 2s; +} +.toc ul li{ + padding: 4px 0; + 
+} +.toc ul li a:hover { + color: rgba(0, 0, 0, 0.842); + +} +.toc ul li a { + color: rgba(0, 0, 0, 0.505); + font-size: 1rem; + /* text-decoration: none; */ +} +.toc h3{ + padding-top: 1.5rem; + text-align: center; + font-size: 1em; +} +.toc ul { + list-style: none; + font-weight: 500; + color: #545454; + padding-left: 14px ; } .author-meta { @@ -220,6 +301,14 @@ section.recent-posts { overflow: hidden !important; } +span.post-name{ + font-size: 14px; + font-weight: 600; + text-transform: uppercase; + line-height: 10px; + font-stretch: expanded; + color:#5a5a5a; +} span.post-name, span.post-date, span.author-meta { @@ -304,15 +393,47 @@ span.post-read-more a:hover { margin-right: 5px; } +.mainheading h1.sitetitle span{ + border: 3px solid #bcbcbc; + padding: 0 8px; +} .mainheading h1.sitetitle { - font-family: Righteous; + margin-bottom: 0; + font-weight: 600; + font-family: fangsong; + /* font-family: Merriweather; */ + font-size: 2em; } +.mainheading .lead { + font-size: 0.9em; + text-align: center; +} .mainheading h1.posttitle { font-weight: 700; margin-bottom: 1rem; } +@media (max-width:780px) { + .toc-container{ + margin: 0 12px; + } + .mainheading h1.sitetitle span{ + border: 1px solid #bcbcbc; + padding: 0 4px; + } + .navbar-expand-lg.navbar-light.bg-white.fixed-top> div{ + justify-content: space-evenly; + } + + .mainheading h1.sitetitle{ + font-size: 1.5em !important; + } + .mainheading .lead { + font-size: 0.8em !important; + } +} + .footer { border-top: 1px solid rgba(0, 0, 0, .05) !important; padding-top: 15px; @@ -330,7 +451,7 @@ span.post-read-more a:hover { } .article-post { - font-family: Merriweather; + /* font-family: Merriweather; */ font-size: 1.1rem; line-height: 1.84; color: rgba(0, 0, 0, .8); @@ -352,6 +473,37 @@ blockquote { display: block; margin-bottom: 1.5rem; } +.post-ctas h3{ + padding-top: 1.5rem; + font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif; + font-size: 1em; + color: #212529; +} +.post-ctas { + padding: 0px 10px 10px !important; + border: 2px solid #bcbcbc; + /* background-color: #f1f1f1; */ +} +.post-ctas ul> li { + display: flex !important; + flex-direction: column; + justify-content: space-between; + min-height: 120px; + padding: 6px 0; +} +.post-ctas a { + display: block; + margin-top: 1em; + padding: 10px; + border-radius: 4px; + color: white !important; + background-color: #00ab6b; + font-weight: 600; +} +.post-ctas a:hover{ + /* text-decoration: underline; */ + color: #0000007c; +} .share { text-align: center; @@ -474,8 +626,8 @@ blockquote { .share, .share a { - color: rgba(0, 0, 0, .44); - fill: rgba(0, 0, 0, .44); + color: rgba(0, 0, 0, .505); + fill: rgba(0, 0, 0, .505); } .graybg { @@ -898,4 +1050,8 @@ iframe { .lazyimg[data-srcset] { opacity: 0; transition: opacity .25s; +} + +.navbar-collapse{ + flex-grow: 0 !important; } \ No newline at end of file diff --git a/assets/images/arun.png b/assets/images/arun.png new file mode 100644 index 0000000000..0f7d5e6dc9 Binary files /dev/null and b/assets/images/arun.png differ diff --git a/assets/images/jumbotron.jpg b/assets/images/jumbotron.jpg index 48d16d876e..6d4eaca6f4 100644 Binary files a/assets/images/jumbotron.jpg and b/assets/images/jumbotron.jpg differ diff --git a/assets/images/jumbotron3.jpg b/assets/images/jumbotron3.jpg new file mode 100644 index 0000000000..48d16d876e Binary files /dev/null and b/assets/images/jumbotron3.jpg differ diff --git a/assets/images/lakshmi.jpg b/assets/images/lakshmi.jpg new file mode 100644 index 0000000000..03cac20e90 Binary 
files /dev/null and b/assets/images/lakshmi.jpg differ diff --git a/assets/images/plf-logo.svg b/assets/images/plf-logo.svg new file mode 100644 index 0000000000..5c1454747c --- /dev/null +++ b/assets/images/plf-logo.svg @@ -0,0 +1,316 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/images/subramanya.jpg b/assets/images/subramanya.jpg new file mode 100644 index 0000000000..7c9b61e997 Binary files /dev/null and b/assets/images/subramanya.jpg differ diff --git a/assets/images/vikhyat.jpg b/assets/images/vikhyat.jpg new file mode 100644 index 0000000000..974290ec5b Binary files /dev/null and b/assets/images/vikhyat.jpg differ diff --git a/assets/images/vikhyat_.jpg b/assets/images/vikhyat_.jpg new file mode 100644 index 0000000000..974290ec5b Binary files /dev/null and b/assets/images/vikhyat_.jpg differ diff --git a/docker-compose.yml b/docker-compose.yml index 6feb3627ef..6900ecdddc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,8 @@ -jekyll: +services: + jekyll: image: jekyll/jekyll:latest - command: jekyll serve --force_polling + command: jekyll serve --force_polling --trace ports: - - 4000:4000 + - 4000:4000 volumes: - - .:/srv/jekyll \ No newline at end of file + - .:/srv/jekyll diff --git a/index.html b/index.html index fcf06c0e08..e1e0e505ed 100644 --- a/index.html +++ b/index.html @@ -34,7 +34,7 @@

    Featured

    -

    All Stories

    +

    All Posts