@inproceedings{kumar2022probing,
  author    = {Kumar, Abhinav and Tan, Chenhao and Sharma, Amit},
  title     = {Probing Classifiers are Unreliable for Concept Removal and Detection},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS})},
  year      = {2022},
  month     = oct,
  abstract  = {Neural network models trained on text data have been found to encode undesired linguistic or sensitive attributes in their representation. Removing such attributes is non-trivial because of a complex relationship between the attribute, text input, and the learnt representation. Recent work has proposed post-hoc and adversarial methods to remove such unwanted attributes from a model's representation. Through an extensive theoretical and empirical analysis, we show that these methods can be counter-productive: they are unable to remove the attributes entirely, and in the worst case may end up destroying all task-relevant features. The reason is the methods' reliance on a probing classifier as a proxy for the attribute. Even under the most favorable conditions when an attribute's features in representation space can alone provide 100\% accuracy for learning the probing classifier, we prove that post-hoc or adversarial methods will fail to remove the attribute correctly. These theoretical implications are confirmed by empirical experiments on models trained on synthetic, Multi-NLI, and Twitter datasets. For sensitive applications of attribute removal such as fairness, we recommend caution against using these methods and propose a spuriousness metric to gauge the quality of the final classifier.},
  url       = {http://approjects.co.za/?big=en-us/research/publication/probing-classifiers-are-unreliable-for-concept-removal-and-detection/},
}