44//! tracking availability of the underlying database
55
66use deadpool:: managed:: { Hook , RecycleError , RecycleResult } ;
7- use diesel:: { r2d2 , IntoSql } ;
7+ use diesel:: IntoSql ;
88
99use diesel_async:: pooled_connection:: { PoolError as DieselPoolError , PoolableConnection } ;
1010use diesel_async:: { AsyncConnection , RunQueryDsl } ;
@@ -18,7 +18,6 @@ use graph::slog::info;
1818use graph:: slog:: Logger ;
1919
2020use std:: collections:: HashMap ;
21- use std:: fmt;
2221use std:: sync:: atomic:: AtomicBool ;
2322use std:: sync:: atomic:: Ordering ;
2423use std:: sync:: Arc ;
@@ -30,13 +29,63 @@ use crate::pool::AsyncPool;
3029/// Our own connection manager. It is pretty much the same as
3130/// `AsyncDieselConnectionManager` but makes it easier to instrument and
3231/// track connection errors
32+ #[ derive( Clone ) ]
3333pub struct ConnectionManager {
34+ logger : Logger ,
3435 connection_url : String ,
36+ state_tracker : StateTracker ,
37+ error_counter : Counter ,
3538}
3639
3740impl ConnectionManager {
38- pub ( super ) fn new ( connection_url : String ) -> Self {
39- Self { connection_url }
41+ pub ( super ) fn new (
42+ logger : Logger ,
43+ connection_url : String ,
44+ state_tracker : StateTracker ,
45+ registry : & MetricsRegistry ,
46+ const_labels : HashMap < String , String > ,
47+ ) -> Self {
48+ let error_counter = registry
49+ . global_counter (
50+ "store_connection_error_count" ,
51+ "The number of Postgres connections errors" ,
52+ const_labels,
53+ )
54+ . expect ( "failed to create `store_connection_error_count` counter" ) ;
55+
56+ Self {
57+ logger,
58+ connection_url,
59+ state_tracker,
60+ error_counter,
61+ }
62+ }
63+
64+ fn handle_error ( & self , error : & dyn std:: error:: Error ) {
65+ let msg = brief_error_msg ( & error) ;
66+
67+ // Don't count canceling statements for timeouts etc. as a
68+ // connection error. Unfortunately, we only have the textual error
69+ // and need to infer whether the error indicates that the database
70+ // is down or if something else happened. When querying a replica,
71+ // these messages indicate that a query was canceled because it
72+ // conflicted with replication, but does not indicate that there is
73+ // a problem with the database itself.
74+ //
75+ // This check will break if users run Postgres (or even graph-node)
76+ // in a locale other than English. In that case, their database will
77+ // be marked as unavailable even though it is perfectly fine.
78+ if msg. contains ( "canceling statement" )
79+ || msg. contains ( "terminating connection due to conflict with recovery" )
80+ {
81+ return ;
82+ }
83+
84+ self . error_counter . inc ( ) ;
85+ if self . state_tracker . is_available ( ) {
86+ error ! ( self . logger, "Connection checkout" ; "error" => msg) ;
87+ }
88+ self . state_tracker . mark_unavailable ( Duration :: from_secs ( 0 ) ) ;
4089 }
4190}
4291
@@ -46,9 +95,11 @@ impl deadpool::managed::Manager for ConnectionManager {
4695 type Error = DieselPoolError ;
4796
4897 async fn create ( & self ) -> Result < Self :: Type , Self :: Error > {
49- diesel_async:: AsyncPgConnection :: establish ( & self . connection_url )
50- . await
51- . map_err ( DieselPoolError :: ConnectionError )
98+ let res = diesel_async:: AsyncPgConnection :: establish ( & self . connection_url ) . await ;
99+ if let Err ( ref e) = res {
100+ self . handle_error ( e) ;
101+ }
102+ res. map_err ( DieselPoolError :: ConnectionError )
52103 }
53104
54105 async fn recycle (
@@ -59,11 +110,14 @@ impl deadpool::managed::Manager for ConnectionManager {
59110 if std:: thread:: panicking ( ) || obj. is_broken ( ) {
60111 return Err ( RecycleError :: Message ( "Broken connection" . into ( ) ) ) ;
61112 }
62- diesel:: select ( 67_i32 . into_sql :: < diesel:: sql_types:: Integer > ( ) )
113+ let res = diesel:: select ( 67_i32 . into_sql :: < diesel:: sql_types:: Integer > ( ) )
63114 . execute ( obj)
64115 . await
65- . map ( |_| ( ) )
66- . map_err ( DieselPoolError :: QueryError ) ?;
116+ . map ( |_| ( ) ) ;
117+ if let Err ( ref e) = res {
118+ self . handle_error ( e) ;
119+ }
120+ res. map_err ( DieselPoolError :: QueryError ) ?;
67121 Ok ( ( ) )
68122 }
69123}
@@ -139,57 +193,6 @@ impl StateTracker {
139193 }
140194}
141195
142- #[ derive( Clone ) ]
143- pub ( super ) struct ErrorHandler {
144- logger : Logger ,
145- counter : Counter ,
146- state_tracker : StateTracker ,
147- }
148-
149- impl ErrorHandler {
150- pub ( super ) fn new ( logger : Logger , counter : Counter , state_tracker : StateTracker ) -> Self {
151- Self {
152- logger,
153- counter,
154- state_tracker,
155- }
156- }
157- }
158- impl std:: fmt:: Debug for ErrorHandler {
159- fn fmt ( & self , _f : & mut fmt:: Formatter ) -> fmt:: Result {
160- fmt:: Result :: Ok ( ( ) )
161- }
162- }
163-
164- impl r2d2:: HandleError < r2d2:: Error > for ErrorHandler {
165- fn handle_error ( & self , error : r2d2:: Error ) {
166- let msg = brief_error_msg ( & error) ;
167-
168- // Don't count canceling statements for timeouts etc. as a
169- // connection error. Unfortunately, we only have the textual error
170- // and need to infer whether the error indicates that the database
171- // is down or if something else happened. When querying a replica,
172- // these messages indicate that a query was canceled because it
173- // conflicted with replication, but does not indicate that there is
174- // a problem with the database itself.
175- //
176- // This check will break if users run Postgres (or even graph-node)
177- // in a locale other than English. In that case, their database will
178- // be marked as unavailable even though it is perfectly fine.
179- if msg. contains ( "canceling statement" )
180- || msg. contains ( "terminating connection due to conflict with recovery" )
181- {
182- return ;
183- }
184-
185- self . counter . inc ( ) ;
186- if self . state_tracker . is_available ( ) {
187- error ! ( self . logger, "Postgres connection error" ; "error" => msg) ;
188- }
189- self . state_tracker . mark_unavailable ( Duration :: from_secs ( 0 ) ) ;
190- }
191- }
192-
193196fn brief_error_msg ( error : & dyn std:: error:: Error ) -> String {
194197 // For 'Connection refused' errors, Postgres includes the IP and
195198 // port number in the error message. We want to suppress that and
0 commit comments