Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

add_configured_to_source_lists.py 7.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
  1. #!/usr/bin/env python3
  2. import argparse
  3. import logging
  4. import os
  5. import numpy as np
  6. import pandas as pd
  7. import astropy.table
  8. from astropy.table import Table
  9. from swgworkflow.xmlanalysis import parse_configured_xmls
  10. def _match_masked_columns(col1, col2, mask1=None, mask2=None):
  11. if mask1 is None:
  12. mask1 = np.zeros(col1.shape, dtype=bool)
  13. if mask2 is None:
  14. mask2 = np.zeros(col2.shape, dtype=bool)
  15. good_col1_values = col1[~mask1]
  16. good_col1_idx = np.nonzero(~mask1)[0]
  17. good_col2_values = col2[~mask2]
  18. good_col2_idx = np.nonzero(~mask2)[0]
  19. _, col1_ind_unmasked, col2_ind_unmasked = np.intersect1d(
  20. good_col1_values, good_col2_values, return_indices=True)
  21. col1_idx = good_col1_idx[col1_ind_unmasked]
  22. col2_idx = good_col2_idx[col2_ind_unmasked]
  23. return col1_idx, col2_idx
  24. def _unmask_column(table, column):
  25. if hasattr(table[column], 'mask'):
  26. table[column].fill_value = 0
  27. unmasked_column = table[column].filled()
  28. mask = table[column].mask
  29. else:
  30. unmasked_column = table[column]
  31. mask = None
  32. return unmasked_column, mask
  33. def _match_source_to_target_lists(target_list, target_column_name, source_list,
  34. source_column_name, source_list_mask=None):
  35. target_column, target_mask = _unmask_column(target_list, target_column_name)
  36. source_column, source_mask = _unmask_column(source_list, source_column_name)
  37. target_ind, source_ind = _match_masked_columns(
  38. col1=target_column, col2=source_column,
  39. mask1=target_mask, mask2=source_mask)
  40. return target_ind, source_ind
  41. def _get_output_filename(source_file, output_dir,
  42. suffix='-configured',
  43. extension='.fits'):
  44. input_basename_wo_ext = os.path.splitext(os.path.basename(source_file))[0]
  45. output_basename_wo_ext = input_basename_wo_ext + suffix
  46. output_file = os.path.join(output_dir, output_basename_wo_ext + extension)
  47. return output_file
  48. def _check_output_file(output_file, overwrite=False):
  49. # If the output file already exists, delete it if overwrite, if not
  50. # return false
  51. if os.path.exists(output_file):
  52. if overwrite == True:
  53. logging.info('Removing previous file: {}'.format(output_file))
  54. os.remove(output_file)
  55. return True
  56. else:
  57. logging.info(
  58. 'Skipping {} as it already exists.'.format(output_file))
  59. return False
  60. return True
def add_columns_to_source_list(source_file, target_cats, output_dir,
                               new_columns, default_values, suffix,
                               overwrite=False):
    """Copy columns from configured target catalogues into a source list.

    Reads ``source_file``, adds each column in ``new_columns`` initialised to
    the corresponding entry of ``default_values``, then for every catalogue in
    ``target_cats`` copies those columns across for rows that match -- first
    on Gaia ID, then on Pan-STARRS ID.  The result is written to a new file in
    ``output_dir`` whose name derives from ``source_file`` plus ``suffix``.

    Parameters
    ----------
    source_file : str
        Path of the input source list (read with astropy ``Table.read``).
    target_cats : iterable of str
        Paths of target catalogues containing the configured columns.
    output_dir : str
        Directory that will receive the output file.
    new_columns : sequence of str
        Names of the columns to copy; each must exist in every catalogue.
    default_values : sequence
        One initial value per entry of ``new_columns``.
    suffix : str
        Suffix appended to the input basename to form the output filename.
    overwrite : bool, optional
        If True, an existing output file is removed first; otherwise this
        source list is skipped.

    Returns
    -------
    str or None
        Path of the written file, or None when the output already existed and
        ``overwrite`` was False.
    """
    output_file = _get_output_filename(source_file, output_dir, suffix=suffix)
    # If the output file already exists, delete it or continue with the next
    # one
    if not _check_output_file(output_file, overwrite):
        return
    # Read the source list and add our new columns
    source_list = Table.read(source_file)
    for column, default in zip(new_columns, default_values):
        source_list[column] = default
    for target_cat in target_cats:
        target_list = Table.read(target_cat)
        # Check the requested columns actually exist in the target catalogue
        for column in new_columns:
            assert column in target_list.columns, \
                "Didn't find {} in {}. ".format(column, target_cat)
        # Get indexes of rows where GAIA_ID matches
        # We need to match only the good, non-masked entries in the target list
        # NOTE(review): the mask built from GAIA_REV_ID below is passed as
        # source_list_mask; it appears to follow the True = exclude numpy
        # convention -- confirm against _match_source_to_target_lists.
        target_column_name, source_column_name = 'GAIA_ID', 'SOURCE_ID'
        source_list_mask = (source_list['GAIA_REV_ID'] != 0)
        target_ind_gaia, source_ind_gaia = _match_source_to_target_lists(
            target_list, target_column_name, source_list, source_column_name,
            source_list_mask=source_list_mask)
        # Append indexes of rows where there is no GAIA_ID in source list,
        # but PS_ID matches a target in the target list
        target_column_name, source_column_name = 'PS_ID', 'PS1_ID'
        source_list_mask = (source_list['GAIA_REV_ID'] == 0)
        target_ind_ps, source_ind_ps = _match_source_to_target_lists(
            target_list, target_column_name, source_list, source_column_name,
            source_list_mask=source_list_mask)
        # Combined row indices from both match passes
        target_ind = np.append(target_ind_gaia, target_ind_ps)
        source_ind = np.append(source_ind_gaia, source_ind_ps)
        # Assert that the source list has the default values and copy across
        for column, default in zip(new_columns, default_values):
            # A matched row no longer holding its default value was already
            # filled by an earlier catalogue: warn, then overwrite anyway.
            if np.any(source_list[column][source_ind] != default):
                msg = "Found ambiguous matches from catalogue {} to source " \
                      "list {}. This may happen if two surveys target the " \
                      "same object.".format(target_cat,source_file)
                logging.warning(msg)
            source_list[column][source_ind] = target_list[column][
                target_ind]
    # Write once, after all catalogues have been applied
    source_list.write(output_file)
    return output_file
  106. if __name__ == '__main__':
  107. parser = argparse.ArgumentParser(
  108. description='After configuring xml files add this information back to '
  109. 'the catalogues')
  110. parser.add_argument('source_file', nargs='+',
  111. help="""One or more source lists""")
  112. parser.add_argument('--catalogues', nargs='+',
  113. help="""Catalogues containing targets""")
  114. parser.add_argument('--outdir', dest='output_dir', default='output',
  115. help="""name of the directory which will contain the
  116. output source lists""")
  117. parser.add_argument('--suffix', default='-configured',
  118. help="""suffix to add to the source lists""")
  119. parser.add_argument('--overwrite', action='store_true',
  120. help='overwrite the output files')
  121. parser.add_argument('--log_level', default='info',
  122. choices=['debug', 'info', 'warning', 'error'],
  123. help='the level for the logging messages')
  124. args = parser.parse_args()
  125. logging.basicConfig(level=getattr(logging, args.log_level.upper()))
  126. if not os.path.exists(args.output_dir):
  127. logging.info('Creating the output directory')
  128. os.makedirs(args.output_dir, exist_ok=True)
  129. new_columns = ('GA_TARGBITS', 'TARGPROG', 'TARGPRIO', 'CONFIGURED',
  130. 'ASSIGNED')
  131. default_values = (0, 40*' ', 0.0, 0, 0)
  132. for source_file in args.source_file:
  133. # Clean after adding the last catalogue if we were asked to
  134. add_columns_to_source_list(source_file=source_file,
  135. target_cats=args.catalogues,
  136. new_columns=new_columns,
  137. default_values=default_values,
  138. suffix=args.suffix,
  139. output_dir=args.output_dir,
  140. overwrite=args.overwrite)
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...