@@ -131,6 +131,10 @@ def __post_init__(self):
131131class UnknownColType (ColType ):
132132 text : str
133133
134+ def __post_init__ (self ):
135+ logger .warn (f"Column of type '{ self .text } ' has no compatibility handling. "
136+ "If encoding/formatting differs between databases, it may result in false positives." )
137+
134138
135139class AbstractDatabase (ABC ):
136140 @abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173177 "Close connection(s) to the database instance. Querying will stop functioning."
174178 ...
175179
180+
176181 @abstractmethod
177- def normalize_value_by_type ( value : str , coltype : ColType ) -> str :
178- """Creates an SQL expression, that converts 'value' to a normalized representation .
182+ def normalize_timestamp ( self , value : str , coltype : ColType ) -> str :
183+ """Creates an SQL expression, that converts 'value' to a normalized timestamp .
179184
180- The returned expression must accept any SQL value, and return a string.
185+ The returned expression must accept any SQL datetime/timestamp, and return a string.
186+
187+ Date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"
188+
189+ Precision of dates should be rounded up/down according to coltype.rounds
190+ """
191+ ...
181192
182- - Dates are expected in the format:
183- "YYYY-MM-DD HH:mm:SS.FFFFFF"
193+ @abstractmethod
194+ def normalize_number (self , value : str , coltype : ColType ) -> str :
195+ """Creates an SQL expression, that converts 'value' to a normalized number.
184196
185- Rounded up/down according to coltype.rounds
197+ The returned expression must accept any SQL int/numeric/float, and return a string.
186198
187199 - Floats/Decimals are expected in the format
188200 "I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191203 and must be at least one digit (0).
192204 P is the fractional digits, the amount of which is specified with
193205 coltype.precision. Trailing zeroes may be necessary.
206+ If P is 0, the dot is omitted.
194207
195208 Note: This precision is different than the one used by databases. For decimals,
196- it's the same as "numeric_scale", and for floats, who use binary precision,
197- it can be calculated as log10(2**p)
209+ it's the same as ``numeric_scale``, and for floats, who use binary precision,
210+ it can be calculated as ``log10(2**numeric_precision)``.
211+ """
212+ ...
213+
214+ def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
215+ """Creates an SQL expression, that converts 'value' to a normalized representation.
216+
217+ The returned expression must accept any SQL value, and return a string.
218+
219+ The default implementation dispatches to a method according to ``coltype``:
198220
221+ TemporalType -> normalize_timestamp()
222+ NumericType -> normalize_number()
223+ -else- -> to_string()
199224
200225 """
201- ...
226+ if isinstance (coltype , TemporalType ):
227+ return self .normalize_timestamp (value , coltype )
228+ elif isinstance (coltype , NumericType ):
229+ return self .normalize_number (value , coltype )
230+ return self .to_string (f"{ value } " )
202231
203232
204233class Database (AbstractDatabase ):
@@ -404,27 +433,16 @@ def md5_to_int(self, s: str) -> str:
404433 def to_string (self , s : str ):
405434 return f"{ s } ::varchar"
406435
407- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
408- if isinstance (coltype , TemporalType ):
409- # if coltype.precision == 0:
410- # return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
411- # if coltype.precision == 3:
412- # return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
413- # elif coltype.precision == 6:
414- # return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
415- # else:
416- # # Postgres/Redshift doesn't support arbitrary precision
417- # raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
418- if coltype .rounds :
419- return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
420- else :
421- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
422- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
423436
424- elif isinstance (coltype , NumericType ):
425- value = f"{ value } ::decimal(38, { coltype .precision } )"
437+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
438+ if coltype .rounds :
439+ return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
426440
427- return self .to_string (f"{ value } " )
441+ timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
442+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
443+
444+ def normalize_number (self , value : str , coltype : ColType ) -> str :
445+ return self .to_string (f"{ value } ::decimal(38, { coltype .precision } )" )
428446
429447
430448class Presto (Database ):
@@ -463,25 +481,19 @@ def _query(self, sql_code: str) -> list:
463481 def close (self ):
464482 self ._conn .close ()
465483
466- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
467- if isinstance (coltype , TemporalType ):
468- if coltype .rounds :
469- if coltype .precision > 3 :
470- pass
471- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
472- else :
473- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
474- # datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
475- # datetime = self.to_string(f"cast({value} as datetime(6))")
484+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
485+ # TODO
486+ if coltype .rounds :
487+ s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
488+ else :
489+ s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
476490
477- return (
478- f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
479- )
480-
481- elif isinstance (coltype , NumericType ):
482- value = f"cast({ value } as decimal(38,{ coltype .precision } ))"
491+ return (
492+ f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
493+ )
483494
484- return self .to_string (value )
495+ def normalize_number (self , value : str , coltype : ColType ) -> str :
496+ return self .to_string (f"cast({ value } as decimal(38,{ coltype .precision } ))" )
485497
486498 def select_table_schema (self , path : DbPath ) -> str :
487499 schema , table = self ._normalize_table_path (path )
@@ -566,18 +578,16 @@ def md5_to_int(self, s: str) -> str:
566578 def to_string (self , s : str ):
567579 return f"cast({ s } as char)"
568580
569- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
570- if isinstance (coltype , TemporalType ):
571- if coltype .rounds :
572- return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
573- else :
574- s = self .to_string (f"cast({ value } as datetime(6))" )
575- return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
581+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
582+ if coltype .rounds :
583+ return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
576584
577- elif isinstance (coltype , NumericType ):
578- value = f"cast({ value } as decimal(38,{ coltype .precision } ))"
585+ s = self .to_string (f"cast({ value } as datetime(6))" )
586+ return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
587+
588+ def normalize_number (self , value : str , coltype : ColType ) -> str :
589+ return self .to_string (f"cast({ value } as decimal(38, { coltype .precision } ))" )
579590
580- return self .to_string (f"{ value } " )
581591
582592
583593class Oracle (ThreadedDatabase ):
@@ -622,16 +632,15 @@ def select_table_schema(self, path: DbPath) -> str:
622632 f" FROM USER_TAB_COLUMNS WHERE table_name = '{ table .upper ()} '"
623633 )
624634
625- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
626- if isinstance (coltype , TemporalType ):
627- return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
628- elif isinstance (coltype , NumericType ):
629- # FM999.9990
630- format_str = "FM" + "9" * (38 - coltype .precision )
631- if coltype .precision :
632- format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
633- return f"to_char({ value } , '{ format_str } ')"
634- return self .to_string (f"{ value } " )
635+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
636+ return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
637+
638+ def normalize_number (self , value : str , coltype : ColType ) -> str :
639+ # FM999.9990
640+ format_str = "FM" + "9" * (38 - coltype .precision )
641+ if coltype .precision :
642+ format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
643+ return f"to_char({ value } , '{ format_str } ')"
635644
636645 def _parse_type (
637646 self , type_repr : str , datetime_precision : int = None , numeric_precision : int = None , numeric_scale : int = None
@@ -682,27 +691,25 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
682691 def md5_to_int (self , s : str ) -> str :
683692 return f"strtol(substring(md5({ s } ), { 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS } ), 16)::decimal(38)"
684693
685- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
686- if isinstance (coltype , TemporalType ):
687- if coltype .rounds :
688- timestamp = f"{ value } ::timestamp(6)"
689- # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
690- secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
691- # Get the milliseconds from timestamp.
692- ms = f"extract(ms from { timestamp } )"
693- # Get the microseconds from timestamp, without the milliseconds!
694- us = f"extract(us from { timestamp } )"
695- # epoch = Total time since epoch in microseconds.
696- epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
697- timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
698- else :
699- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
700- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
694+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
695+ if coltype .rounds :
696+ timestamp = f"{ value } ::timestamp(6)"
697+ # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
698+ secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
699+ # Get the milliseconds from timestamp.
700+ ms = f"extract(ms from { timestamp } )"
701+ # Get the microseconds from timestamp, without the milliseconds!
702+ us = f"extract(us from { timestamp } )"
703+ # epoch = Total time since epoch in microseconds.
704+ epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
705+ timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
706+ else :
707+ timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
708+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
701709
702- elif isinstance ( coltype , NumericType ) :
703- value = f"{ value } ::decimal(38,{ coltype .precision } )"
710+ def normalize_number ( self , value : str , coltype : ColType ) -> str :
711+ return self . to_string ( f"{ value } ::decimal(38,{ coltype .precision } )" )
704712
705- return self .to_string (f"{ value } " )
706713
707714
708715class MsSQL (ThreadedDatabase ):
@@ -794,25 +801,22 @@ def select_table_schema(self, path: DbPath) -> str:
794801 f"WHERE table_name = '{ table } ' AND table_schema = '{ schema } '"
795802 )
796803
797- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
798- if isinstance (coltype , TemporalType ):
799- if coltype .rounds :
800- timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
801- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
802- else :
803- if coltype .precision == 0 :
804- return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
805- elif coltype .precision == 6 :
806- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
804+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
805+ if coltype .rounds :
806+ timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
807+ return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
807808
808- timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
809- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
809+ if coltype .precision == 0 :
810+ return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
811+ elif coltype .precision == 6 :
812+ return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
810813
811- elif isinstance (coltype , NumericType ):
812- # value = f"cast({value} as decimal)"
813- return f"format('%.{ coltype .precision } f', cast({ value } as decimal))"
814+ timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
815+ return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
814816
815- return self .to_string (f"{ value } " )
817+ def normalize_number (self , value : str , coltype : ColType ) -> str :
818+ # value = f"cast({value} as decimal)"
819+ return f"format('%.{ coltype .precision } f', cast({ value } as decimal))"
816820
817821 def parse_table_name (self , name : str ) -> DbPath :
818822 path = parse_table_name (name )
@@ -886,19 +890,16 @@ def select_table_schema(self, path: DbPath) -> str:
886890 schema , table = self ._normalize_table_path (path )
887891 return super ().select_table_schema ((schema , table ))
888892
889- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
890- if isinstance (coltype , TemporalType ):
891- if coltype .rounds :
892- timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
893- else :
894- timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
893+ def normalize_timestamp (self , value : str , coltype : ColType ) -> str :
894+ if coltype .rounds :
895+ timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
896+ else :
897+ timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
895898
896- return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
899+ return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
897900
898- elif isinstance (coltype , NumericType ):
899- value = f"cast({ value } as decimal(38, { coltype .precision } ))"
900-
901- return self .to_string (f"{ value } " )
901+ def normalize_number (self , value : str , coltype : ColType ) -> str :
902+ return self .to_string (f"cast({ value } as decimal(38, { coltype .precision } ))" )
902903
903904
904905@dataclass
0 commit comments